-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathindex.js
executable file
·110 lines (102 loc) · 3.11 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env node
const fs = require('fs');
const puppeteer = require('puppeteer');
const pageParser = require('./lib/parser/pageParser');
const articleParser = require('./lib/parser/articleParser');
const ignoreImgLoad = require('./lib/request/ignoreImgLoad');
/**
 * Builds the crawler configuration from a process-style argument vector.
 *
 * Positional arguments (after the node binary and the script path):
 *   argv[2] - board name; defaults to 'Gossiping'.
 *   argv[3] - index page number to start from; 0 (the default) means the
 *             latest page. Must be a non-negative integer.
 *   argv[4] - the literal string 'false' turns headless mode off.
 *
 * @param {string[]} argv - Argument vector shaped like `process.argv`.
 * @returns {{Board: string, nowPage: number, age18: number,
 *            isHeadless: boolean, interval: number}} The effective settings.
 */
function buildConfig(argv) {
  const [, , board, page, headless] = argv;
  const settings = {
    Board: 'Gossiping',
    nowPage: 0,
    age18: 1,
    isHeadless: true,
    interval: 5 // seconds
  };

  if (board) settings.Board = board;

  if (page) {
    // Command-line arguments are always strings. Parse to a number so the
    // `nowPage === 0` ("latest") checks behave, and refuse values that would
    // otherwise be spliced verbatim into the request URL.
    if (/^\d+$/.test(page)) {
      settings.nowPage = Number.parseInt(page, 10);
    } else {
      console.warn(
        `Ignoring invalid page number "${page}"; starting from the latest page.`
      );
    }
  }

  if (headless === 'false') settings.isHeadless = false;

  return settings;
}

const config = buildConfig(process.argv);
// future work: yargs
console.log(
  `Board ${config.Board} / Page ${
    config.nowPage === 0 ? 'latest' : config.nowPage
  } / headless? ${config.isHeadless}`
);
/**
 * Entry point: launches one browser, crawls the starting index page, then
 * crawls one older index page every `config.interval` seconds. Each crawled
 * page is written to `data/<Board>/<Board>_<page>.json`.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: config.isHeadless });

  /**
   * Crawls a single index page of the board and every article it links to,
   * then stores the parsed articles as JSON.
   *
   * @param {object} config - Settings to crawl with. It is copied, never
   *   mutated; `Board`, `nowPage` and `age18` are read.
   * @returns {Promise<object>} The copy of the settings, with `nowPage` set
   *   to the page number reported by the page parser and `age18` decremented
   *   if the over-18 cookie was set during this call. When the index page
   *   answers with an HTTP error, `nowPage` is returned unchanged and nothing
   *   is written.
   */
  const request = async config => {
    // Work on a copy so concurrent crawls do not race on the shared settings.
    const crawler = Object.assign({}, config);
    const page = await browser.newPage();
    let articleInfo;

    try {
      await page.setRequestInterception(true);
      // Skip heavy resources (img, stylesheet, font, ...) via the project helper.
      page.on('request', ignoreImgLoad);

      const currentTarget = `https://www.ptt.cc/bbs/${crawler.Board}/index${
        crawler.nowPage
      }.html`;
      console.log(
        `crawler requests to ${crawler.Board} / ${
          crawler.nowPage === 0 ? 'latest' : crawler.nowPage
        }...`
      );

      let response = await page.goto(currentTarget, {
        waitUntil: 'domcontentloaded',
        timeout: 0
      });

      // For over-18 boards: the first visit gets redirected, so set the
      // consent cookie and navigate to the target again. A plain reload would
      // reload the page we were redirected to instead.
      if (crawler.age18) {
        await page.setCookie({
          name: 'over18',
          value: '1'
        });
        response = await page.goto(currentTarget, {
          waitUntil: 'domcontentloaded',
          timeout: 50 * 1000
        });
        crawler.age18 -= 1; // the cookie only has to be set once
      }

      // Do not parse an error page: that would yield an undefined page number
      // and a bogus output file.
      if (response && response.status() >= 400) {
        console.log('此頁不存在');
        return crawler;
      }

      const pageInfo = await page.evaluate(pageParser);
      crawler.nowPage = pageInfo.pageNumber; // actual number of this page

      articleInfo = [];
      for (const { link } of pageInfo.links) {
        const article = await page.goto(link, {
          waitUntil: 'domcontentloaded',
          timeout: 0
        });
        if (article && article.status() >= 400) {
          console.log('此篇文章不存在');
          continue;
        }
        articleInfo.push(await page.evaluate(articleParser));
      }
    } finally {
      // Always release the tab, including when navigation or parsing throws.
      await page.close();
    }

    const outputDir = `data/${crawler.Board}`;
    const outputFile = `./${outputDir}/${crawler.Board}_${crawler.nowPage}.json`;
    await fs.promises.mkdir(outputDir, { recursive: true });
    await fs.promises.writeFile(outputFile, JSON.stringify(articleInfo), {
      flag: 'w'
    });
    console.log(
      `Saved as data/${crawler.Board}/${crawler.Board}_${crawler.nowPage}.json`
    );

    return crawler;
  };

  let firstRun;
  try {
    firstRun = await request(config);
  } catch (err) {
    // Nothing has been scheduled yet; do not leave the browser running.
    await browser.close();
    throw err;
  }
  // The first request may have asked for page 0 ("latest"); continue from
  // the real page number the parser reported.
  config.nowPage = firstRun.nowPage;
  // Pages of one browser share cookies, so once the over-18 cookie is set the
  // following requests can skip the extra navigation.
  config.age18 = firstRun.age18;

  // Crawls may take longer than one interval, so several can be running at
  // once; track them so the browser is closed only after all have finished.
  const inFlight = new Set();
  const timer = setInterval(() => {
    config.nowPage--;

    // Stop once there is no older page left. The negated comparison also
    // stops on NaN, i.e. when no valid page number was ever obtained.
    // NOTE(review): assumes index pages are numbered from 1 - confirm.
    if (!(config.nowPage >= 1)) {
      clearInterval(timer);
      Promise.all([...inFlight])
        .then(() => browser.close())
        .catch(err => console.error('Failed to close the browser:', err));
      return;
    }

    const pageNumber = config.nowPage;
    const job = request(config)
      .catch(err =>
        console.error(`Crawling page ${pageNumber} failed:`, err)
      )
      .finally(() => inFlight.delete(job));
    inFlight.add(job);
  }, config.interval * 1000);
  // Async Request / process(parse) / IO
})().catch(err => {
  console.error(err);
  process.exitCode = 1;
});