Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
MR414N-ID
GitHub Repository: MR414N-ID/botku2
Path: blob/master/node_modules/@bochilteam/scraper/lib/esm/others/wikipedia.js
1126 views
1
import cheerio from 'cheerio';
2
import got from 'got';
3
import { ScraperError } from '../utils.js';
4
/**
 * Fetch a Wikipedia article and extract its title, lead image and body text.
 *
 * The page HTML comes from `getHtml`, which may fall back to the first
 * search hit when there is no exact page for `query`.
 *
 * @param {string} query - Article title or search phrase.
 * @param {string} [lang='id'] - Wikipedia language code, forwarded to `getHtml`.
 * @returns {Promise<{title: string, img: string, articles: string}>}
 *   `title` is the text of `#firstHeading`, `img` is the infobox image URL as
 *   returned by `getImgLink`, and `articles` is the collected block texts
 *   joined by blank lines.
 */
export default async function wikipedia(query, lang = 'id') {
    const html = await getHtml(lang, query);
    const $ = cheerio.load(html);
    // Read the whole heading: the previous `#firstHeading > i` selector only
    // matched italicised titles and produced '' for every other article.
    const title = $('#firstHeading').text().trim();
    const img = getImgLink($('td.infobox-image > a.image > img[src]').attr('src'));
    const articles = [];
    // Becomes true at the first non-empty <p>; anything before it is skipped.
    let started = false;
    const blocks = $('#mw-content-text > div.mw-parser-output').children().toArray();
    for (const node of blocks) {
        const tag = node.name;
        // NOTE: unanchored on purpose to keep existing output; it also accepts
        // tag names that merely contain "p" (e.g. `pre`, `span`).
        if (!/p|h[2-4]|div/.test(tag)) {
            continue;
        }
        const isHeading = /h[2-4]/.test(tag);
        const isDiv = /div/.test(tag);
        const el = $(node);
        // Stop at the references / notes section heading (en/id, ru, fr ids).
        if (isHeading &&
            /referen|Примечания|Notes_et_références/i.test(el.find('span.mw-headline').attr('id') || '')) {
            break;
        }
        // Replace each formula with its own plain text. Handling the matched
        // set as a whole would give every formula the concatenated text of
        // all formulas in the block.
        el.find('span.mwe-math-element').each((_, mathNode) => {
            const formula = $(mathNode);
            const plain = formula
                .text()
                .trim()
                .replace(/(.*displaystyle.*|\\n)/, '');
            formula.replaceWith($('<span>').text(plain));
        });
        const thumbImg = el.find('div.thumbinner > a > img[src]');
        const listItems = el.find('div > ol > li[id]');
        let text = '';
        if (isDiv && el.hasClass('thumb') && thumbImg.length) {
            // Thumbnail boxes contribute the image URL instead of their caption.
            text = getImgLink(thumbImg.attr('src'));
        }
        else if (isDiv && listItems.length) {
            // Ordered list items carrying an id: one line per item.
            listItems.each((_, item) => {
                text += $(item).text().trim() + '\n';
            });
        }
        else {
            text = el.text().trim();
        }
        if (!started && tag === 'p' && text) {
            started = true;
        }
        // Blocks that wrap a `div > ul > li` list are left out of the output.
        if (text && started && !el.find('div > ul > li').length) {
            articles.push((isHeading ? '\n' : '') + text);
        }
    }
    return {
        title,
        img,
        articles: articles.join('\n\n')
    };
}
57
/**
 * Tell whether `lang` is one of the Wikipedia language codes this module
 * queries directly ('en' or 'id').
 *
 * @param {*} lang - Candidate language code.
 * @returns {boolean} `true` only for the exact strings 'en' and 'id'.
 */
function isSupportLang(lang) {
    switch (lang) {
        case 'en':
        case 'id':
            return true;
        default:
            return false;
    }
}
60
/**
 * Download the HTML of the Wikipedia page for `query`.
 *
 * The article URL `/wiki/<query>` is tried first. If that answers 404, the
 * wiki's full-text search is queried and the first result is downloaded
 * instead.
 *
 * @param {string} lang - Language code; anything `isSupportLang` rejects
 *   falls back to 'id'.
 * @param {string} query - Article title or search phrase.
 * @returns {Promise<string>} Response body of the article page.
 * @throws {ScraperError} When neither the direct page nor the search yields
 *   an article.
 */
async function getHtml(lang, query) {
    const wikiLang = isSupportLang(lang) ? lang : 'id';
    const defaultLink = `https://${wikiLang}.wikipedia.org`;
    const encodedQuery = encodeURIComponent(query.trim());
    try {
        const res = await got(`${defaultLink}/wiki/${encodedQuery}`);
        if (res.statusCode !== 404) {
            return res.body;
        }
    }
    catch (err) {
        // By default got rejects with an HTTPError for non-2xx responses, so a
        // missing page arrives here rather than as `statusCode === 404`.
        // Only a 404 may fall through to the search; rethrow everything else.
        const status = err && err.response ? err.response.statusCode : undefined;
        if (status !== 404) {
            throw err;
        }
    }
    // Pick the search form from the language actually used for the host, so
    // an unsupported `lang` (served from id.wikipedia.org) gets the matching
    // Indonesian special page.
    const searchParams = wikiLang === 'id'
        ? `title=Istimewa:Pencarian&search=${encodedQuery}&fulltext=1&ns0=1`
        : `search=${encodedQuery}&title=Special:Search&profile=advanced&fulltext=1&ns0=1`;
    const searchPage = await got(`${defaultLink}/w/index.php?${searchParams}`);
    const $ = cheerio.load(searchPage.body);
    const firstHref = $('ul.mw-search-results > li.mw-search-result')
        .map((_, item) => ($(item)
        .find('div.mw-search-result-heading > a[href]')
        .attr('href') || '').trim())
        .get()
        .find(Boolean);
    if (firstHref) {
        // Resolve with URL instead of encodeURI: the href is normally already
        // percent-encoded and encodeURI would escape its "%" a second time.
        return (await got(new URL(firstHref, defaultLink).href)).body;
    }
    throw new ScraperError('404 Not Found!!');
}
85
/**
 * Turn an image `src` attribute into an absolute URL.
 *
 * - A missing or empty `src` yields '' (previously the meaningless 'https:').
 * - A value that already starts with `http:` or `https:` is returned as is.
 * - Anything else (typically protocol-relative `//host/path`) is prefixed
 *   with `https:` and URI-encoded, leaving existing percent-escapes intact.
 *
 * @param {string} [link=''] - Raw `src` value, possibly undefined.
 * @returns {string} Absolute image URL, or '' when there is no image.
 */
function getImgLink(link = '') {
    if (!link) {
        return '';
    }
    // Anchored so "https:" inside a query string is not mistaken for the
    // scheme, and so plain http URLs are not turned into "https:http://...".
    if (/^https?:/i.test(link)) {
        return link;
    }
    // encodeURI escapes "%" as "%25"; undo that where it was already the
    // start of a percent-escape so "%C3%B4" does not become "%25C3%25B4".
    return encodeURI('https:' + link).replace(/%25(?=[0-9a-f]{2})/gi, '%');
}
90
//# sourceMappingURL=wikipedia.js.map
91