Code-15
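"""Selenium-based scraper for https://dynamic2.scrape.cuiqingcai.com/.

Walks the paginated movie index in a headless Chrome browser, opens every
detail page, extracts the title, categories, cover, score and description,
and saves each record as a JSON file under results2/.
"""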
import json
import logging
from os import makedirs
from os.path import exists
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://dynamic2.scrape.cuiqingcai.com/page/{page}'
TIME_OUT = 10
TOTAL_PAGE = 10

# Launch Chrome in headless mode and bind an explicit wait to the single
# browser instance shared by all functions below.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, TIME_OUT)
def scrape_page(url, condition, locator):
    """Load a page and wait until the expected condition holds for the locator."""
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """Open one page of the movie index and wait for all list items to be visible."""
    url = INDEX_URL.format(page=page)
    scrape_page(url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, '#index .item'))

def parse_index():
    """Yield the absolute detail-page URLs found on the currently loaded index page."""
    elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)

def scrape_detail(url):
    """Open a detail page and wait for its title (h2) to become visible."""
    scrape_page(url, condition=EC.visibility_of_element_located,
                locator=(By.TAG_NAME, 'h2'))

def parse_detail():
    """Extract name, categories, cover, score and drama from the loaded detail page."""
    url = browser.current_url
    name = browser.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in
                  browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
    cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
    score = browser.find_element(By.CLASS_NAME, 'score').text
    drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }

RESULTS_DIR = 'results2'
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)


def save_data(data):
    """Save one movie's data as pretty-printed JSON under RESULTS_DIR."""
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)

def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('detail data %s', detail_data)
                save_data(detail_data)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
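# A minimal usage sketch (not part of the original script; names are assumptions):
# after a run, save_data() has written one JSON file per movie under results2/,
# so a record can be loaded back for inspection. The file name below is
# hypothetical; substitute any file actually produced by a run.
#
# import json
# with open('results2/SomeMovie.json', encoding='utf-8') as f:
#     record = json.load(f)
#     print(record['name'], record['score'], record['categories'])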