-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_data.py
66 lines (50 loc) · 2.15 KB
/
fetch_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import json
import os
import pandas as pd
import requests
from tqdm import tqdm
class DataFetcher:
    """Send batches of ids to the text API and collect the responses.

    Methods
    -------
    request_text(data_file_path: str, ids_per_request: int = 1000,
                 timeout: float = 60.0) -> pd.DataFrame
        Request the text for every id listed in a CSV file and return it,
        together with each row's dialect, as a dataframe.
    """

    # Largest number of ids that may be sent in a single request.
    MAX_IDS_PER_REQUEST: int = 1000

    def __init__(self) -> None:
        self.__URL: str = "https://recruitment.aimtechnologies.co/ai-tasks"

    def request_text(
        self,
        data_file_path: str,
        ids_per_request: int = 1000,
        timeout: float = 60.0,
    ) -> pd.DataFrame:
        """Request the data from the API using ids and return it as a dataframe.

        The CSV file is read in chunks of ``ids_per_request`` rows. For every
        chunk the values of its ``id`` column are POSTed to the API as a JSON
        list, and the values of the returned JSON object are stored in a new
        ``text`` column next to the chunk's existing ``dialect`` column.

        Parameters
        ----------
        data_file_path: str
            the location of the CSV file containing data ids. It must have a
            header row with at least the columns ``id`` and ``dialect``.
        ids_per_request: int
            Number of ids per request, the maximum number allowed is 1000.
            default = 1000
        timeout: float
            Seconds to wait for the server on each request before giving up.
            default = 60.0

        Returns
        -------
        fetched_data: pd.DataFrame
            A dataframe which contains fetched text and its dialect.

        Raises
        ------
        ValueError
            If ``ids_per_request`` is outside ``1..MAX_IDS_PER_REQUEST``, or
            if a response holds a different number of texts than ids sent.
        requests.HTTPError
            If the API answers a request with an error status code.
        """
        # Validate before touching the file or the network so a bad argument
        # fails fast with a clear message.
        if not 1 <= ids_per_request <= self.MAX_IDS_PER_REQUEST:
            raise ValueError(
                "ids_per_request must be between 1 and "
                f"{self.MAX_IDS_PER_REQUEST}, got {ids_per_request}"
            )

        # Batches are collected in a list and concatenated once at the end;
        # concatenating inside the loop re-copies every earlier row each time.
        fetched_chunks = []

        print("Fetching text data:")
        # The chunked reader keeps the file open; the context manager closes
        # it even when a request fails part-way through.
        with pd.read_csv(
            data_file_path,
            header=0,
            dtype=str,
            chunksize=ids_per_request,
        ) as data_reader:
            for chunk in tqdm(data_reader):
                if chunk.empty:
                    # Nothing to ask the API for.
                    continue

                ids_json = json.dumps(chunk["id"].to_list())
                response = requests.post(self.__URL, data=ids_json, timeout=timeout)
                # Surface HTTP errors instead of parsing an error page as data.
                response.raise_for_status()

                # NOTE(review): texts are paired with ids by position, which
                # assumes the API returns one entry per id in request order -
                # confirm against the API contract.
                texts = list(response.json().values())
                if len(texts) != len(chunk):
                    raise ValueError(
                        f"API returned {len(texts)} texts for {len(chunk)} ids"
                    )

                chunk["text"] = texts
                fetched_chunks.append(chunk[["text", "dialect"]])
        print("Fetching completed Successfully!")

        if not fetched_chunks:
            # Keep the documented column layout even when there was no data.
            return pd.DataFrame(columns=["text", "dialect"])
        return pd.concat(fetched_chunks, ignore_index=True)
if __name__ == "__main__":
    # Input and output both live in the "Datasets" folder next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_directory = os.path.join(script_dir, "Datasets")
    ids_csv_path = os.path.join(data_directory, "dialect_id_target.csv")
    output_csv_path = os.path.join(data_directory, "dialect_dataset.csv")

    # Fetch the text for every id listed in the input file.
    data = DataFetcher().request_text(ids_csv_path)

    # Persist the fetched text/dialect pairs without the dataframe index.
    print("Saving The dataframe: ", end="")
    data.to_csv(output_csv_path, index=False)
    print("done!")