-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract-neb-prices.py
31 lines (25 loc) · 1.15 KB
/
extract-neb-prices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from bs4 import BeautifulSoup
import pandas
import glob
def parse_units(text):
    """Return the pack size encoded in a quantity label as an ``int``.

    The "Menge: " prefix and " units" suffix are stripped and every "." is
    removed before conversion, so "Menge: 1.000 units" yields 1000.

    Raises:
        ValueError: if what remains is not an integer literal.
    """
    return int(text.replace("Menge: ", "").replace(" units", "").replace(".", ""))


def parse_price(text):
    """Return a price string with a decimal comma as a ``float``.

    "12,50" yields 12.5. When a comma is present, any "." is dropped first so
    that "1.234,56" parses as 1234.56 instead of raising; this mirrors
    parse_units, which already treats "." as a grouping character.
    Strings without a comma are handed to ``float`` unchanged.

    Raises:
        ValueError: if the normalised text is not a float literal.
    """
    if "," in text:
        text = text.replace(".", "").replace(",", ".")
    return float(text)


def parse_item(item):
    """Build one result row (a dict) from a ``div`` with class "row item".

    Reads the article number from the first nested ``div``, the name from the
    ``h2`` and the quantity from the first ``span`` of the second nested
    ``div``, and the price from the third ``span`` found anywhere in the item.

    Raises:
        IndexError / AttributeError: if the expected elements are missing.
        ZeroDivisionError: if the parsed quantity is 0.
    """
    all_divs = item.find_all("div")
    units = parse_units(all_divs[1].span.text)
    price_in_euro = parse_price(item.find_all("span")[2].text)
    # Key order determines the column order of the resulting DataFrame.
    return {
        "name": all_divs[1].h2.text,
        "articlenumber": all_divs[0].text.strip(),
        "units": units,
        "price_in_euro": price_in_euro,
        "price_per_unit": price_in_euro / units,
    }


# One DataFrame per HTML file that contained at least one item.
dataframes = []
for node in glob.glob("neb/*.html"):
    # The parser consumes the whole handle inside the constructor, so the file
    # can be closed as soon as the soup has been built.
    with open(node) as handle:
        soup = BeautifulSoup(handle, "html.parser")
    rows = [parse_item(item) for item in soup.find_all("div", "row item")]
    print(rows)
    if rows:
        dataframes.append(pandas.DataFrame(rows))

if dataframes:
    merged_df = pandas.concat(dataframes, ignore_index=True)
    # Price per unit expressed as a multiple of the cheapest price per unit.
    cheapest = merged_df.price_per_unit.min()
    merged_df["relative_price"] = (merged_df.price_per_unit / cheapest).round(1)
    print(merged_df)
    merged_df.sort_values(by="name").to_excel("neb-prices.xlsx")
else:
    # pandas.concat raises ValueError on an empty list; report the situation
    # instead of crashing when no file matched or no item was found.
    print("No items found in neb/*.html; nothing to export.")