-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathextractor.py
34 lines (24 loc) · 949 Bytes
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from bs4 import BeautifulSoup
import os
import re
# Names of the input HTML files. os.listdir() returns entries in arbitrary
# order, so sort them to make the positional pairing with file_names
# deterministic from run to run.
# NOTE(review): assumes the input file names sort in book order -- confirm.
data = sorted(os.listdir('data/'))
# Base names (without extension) of the output files, matched to the entries
# of `data` by position.
file_names = ['book1', 'book2', 'book3', 'book4', 'book5', 'book6', 'book7']
def extract_text(file_path):
    """Return the text inside the first ``<pre>`` element of an HTML file.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The concatenated text content of the first ``<pre>`` element.

    Raises:
        ValueError: If the document contains no ``<pre>`` element.
    """
    # NOTE(review): the file is read with the platform default encoding, as
    # before -- confirm the encoding of the input files and pass it explicitly.
    with open(file_path, 'r') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    pre = soup.pre
    if pre is None:
        raise ValueError('no <pre> element found in {!r}'.format(file_path))

    # `.string` is None as soon as <pre> holds more than one child node
    # (for example inline markup), which would make the caller fail later on
    # a None value; get_text() joins every text node instead.
    return pre.get_text()
def write_data(text, file_name):
    """Write *text* to ``final_data/<file_name>.txt``, replacing any old file.

    Args:
        text: The string to write.
        file_name: Base name of the output file, without the ``.txt`` suffix.
    """
    out_path = 'final_data/' + file_name + '.txt'
    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it rather than failing with FileNotFoundError on open().
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as file:
        file.write(text)
# Running page header of the form
# "Page | 12 Harry Potter and the ... - J.K. Rowling"; the optional single
# spaces let it match spaced-out variants such as "P a g e | 12".
_PAGE_HEADER_RE = re.compile(
    r'P( )?a( )?g( )?e( )?\|( )?[0-9a-zA-Z]+( )?(\n)*Harry Potter [a-zA-Z ]+( )?-( )?J.K. Rowling'
)

# Characters trimmed from both ends of every word. '!' and '.' are not in the
# set, so they are kept.
_STRIP_CHARS = '"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'


def clean_data(text):
    """Normalize raw book text into lowercase, space-separated words.

    Slashes become spaces, page headers matching ``_PAGE_HEADER_RE`` are
    removed, the text is lowercased, and the characters in ``_STRIP_CHARS``
    are trimmed from both ends of each word.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    ctext = text.replace('/', ' ')
    # Headers must be removed before lowercasing: the pattern is
    # case-sensitive.
    ctext = _PAGE_HEADER_RE.sub(' ', ctext)
    # split() with no argument breaks on any whitespace run, newlines
    # included, so no separate newline replacement is needed.
    trimmed = (word.strip(_STRIP_CHARS) for word in ctext.lower().split())
    # Drop tokens that were nothing but punctuation (e.g. a lone '-'), which
    # would otherwise leave empty strings and double spaces in the result.
    return ' '.join(word for word in trimmed if word)
# Run each input file through extract -> clean -> write. Inputs and output
# names are paired by position; zip() stops at the shorter of the two, so any
# surplus inputs or names are ignored.
for source_name, output_name in zip(data, file_names):
    cleaned = clean_data(extract_text('data/' + source_name))
    write_data(cleaned, output_name)