"""
Created on Mon Feb 7 14:10:42 2022
@author: Okhrimchuk Roman
for Sierentz Global Merchants
Test task
"""
# TODO Import the necessary libraries
import pandas as pd

# TODO Import the dataset
path = r'./data/weather_dataset.data'
data_orig = pd.read_csv(path, sep=r'\s+', encoding='utf8')
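# the raw file is assumed to hold whitespace-separated columns: Yr/Mo/Dy
# date parts plus one windspeed column per measuring location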

# TODO Assign it to a variable called data and replace the first 3 columns by a proper datetime index
# TODO Write a function in order to fix the date (this relates only to the year info) and apply it
# TODO Set the right dates as the index. Pay attention to the data type, it should be datetime64[ns]
data = data_orig.copy()

def fix_date(df, column_names, century='19'):
    # the year column holds two digits only, so prepend the century and
    # build 'YYYY/MM/DD' strings before parsing them as datetimes
    date = df[column_names].astype('string')
    date = century + date.apply(lambda x: '/'.join(x.values), axis=1)
    date = pd.to_datetime(date, format='%Y/%m/%d')
    return date

date_columns = ['Yr', 'Mo', 'Dy']
date_column = fix_date(data, date_columns)
data = data.set_index(pd.DatetimeIndex(date_column))
data = data.drop(date_columns, axis=1)
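# sanity check (optional): the index should now be datetime64[ns]
assert data.index.dtype == 'datetime64[ns]'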

# TODO Check if everything is okay with the data. Create functions to delete/fix rows with strange cases and apply them

# get all rows that failed numeric conversion
def get_corrupted_numeric_rows(df):
    corrupted_data_mask = df.apply(pd.to_numeric, errors='coerce').isna().any(axis=1)
    return df[corrupted_data_mask]

# fix numbers that use a comma as the decimal separator
def fix_comma_numeric_columns(df):
    return df.replace(',', '.', regex=True)

# fill NaN cells with the average of the other locations on the same day
def fix_na_numeric_columns(df):
    return df.apply(lambda row: row.fillna(row.mean()), axis=1)

# check for outliers (like -123 for loc11)
IQR_RATE = 12

def fix_outlier_numeric_values(df):
    # per-day quartiles across all locations
    q1 = df.quantile(.25, axis=1)
    q3 = df.quantile(.75, axis=1)
    iqr = q3 - q1
    lower = q1 - IQR_RATE * iqr
    upper = q3 + IQR_RATE * iqr
    # compare every column against the per-day bounds (aligned on the date
    # index); values outside the bounds become NaN and are filled afterwards
    mask = df.ge(lower, axis=0) & df.le(upper, axis=0)
    return df[mask]
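# example: for a day with q1 = 8 and q3 = 14 the iqr is 6, so only values
# outside [8 - 12*6, 14 + 12*6] = [-64, 86] are treated as corrupt -- wide
# enough to keep real gusts while still catching entry errors like -123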

# find the corrupted rows, repair the decimal commas, then write them back
data_to_fix = get_corrupted_numeric_rows(data)
data_to_fix = fix_comma_numeric_columns(data_to_fix)
data.loc[data_to_fix.index] = data_to_fix
# anything still non-numeric becomes NaN; outliers are then blanked out and
# every NaN cell is filled with the same-day average
data = data.apply(pd.to_numeric, errors='coerce')
data = fix_outlier_numeric_values(data)
data = fix_na_numeric_columns(data)
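# sanity check (optional): the cleaned frame should be fully numeric
# assert data.notna().all().all()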
# TODO Compute how many values are missing for each location over the entire record
missing_per_location = data.isna().sum()
# TODO Compute how many non-missing values there are in total
# count individual non-missing cells rather than rows with at least one value
non_missing_total = data.notna().sum().sum()
# TODO Calculate the mean windspeed over all the locations and all the times
general_mean = data.mean(axis=1).mean()
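# note: the mean of the daily means equals the overall mean here because,
# after the NaN filling above, every day has a value for every location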
# TODO Create a DataFrame called loc_stats and calculate the min, max and mean windspeeds and standard deviations of the windspeeds at each location over all the days
loc_stats = data.describe().loc[['min', 'max', 'mean', 'std']].T
# TODO Find the average windspeed in January for each location
month_filter = data.index.month_name() == 'January'
january_mean = data.loc[month_filter].mean()
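# equivalent formulation: data[data.index.month == 1].mean()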
# TODO Downsample the record to a yearly frequency for each location
yearly = data.resample('Y').mean()
# TODO Downsample the record to a monthly frequency for each location
monthly = data.resample('M').mean()
# TODO Downsample the record to a weekly frequency for each location
weekly = data.resample('W').mean()
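# note: pandas >= 2.2 deprecates the 'Y' and 'M' aliases in favour of 'YE'
# and 'ME'; the short forms above match the pandas releases current in 2022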
# TODO Calculate the min, max and mean windspeeds and standard deviations of the windspeeds across all locations for each week (assume that the first week starts on January 2 1961) for the first 21 weeks
def calc_stat(resample, func_name, n=21):
    # pool every location's readings within a week before applying the
    # statistic, so 'std' measures the spread of all values in the week
    # instead of taking a std of the per-location stds
    stats = {week: getattr(group.stack(), func_name)() for week, group in resample}
    return pd.Series(stats).head(n)

# weeks run Monday to Sunday: dropping Jan 1 1961 makes the first bin the
# full week that starts on Monday, January 2 1961
resample = data.loc['1961-01-02':].resample('W')
first_weeks_stats = pd.DataFrame({func: calc_stat(resample, func)
                                  for func in ('min', 'max', 'mean', 'std')})
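# a minimal cross-check (optional): stack the locations into one long Series
# and aggregate per week; this should reproduce first_weeks_stats
# pooled = data.loc['1961-01-02':].stack().groupby(pd.Grouper(level=0, freq='W')).agg(['min', 'max', 'mean', 'std'])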

# print the results (they can be redirected to a file later if required)
print('\n\nCompute how many values are missing for each location over the entire record\n', missing_per_location)
print('\n\nCompute how many non-missing values there are in total: ', non_missing_total)
print('\n\nCalculate the mean windspeed over all the locations and all the times:', general_mean)
print('\n\nCreate a DataFrame called loc_stats and calculate the min, max and mean windspeeds and standard deviations of the windspeeds at each location over all the days\n', loc_stats)
print('\n\nFind the average windspeed in January for each location\n', january_mean)
print('\n\nDownsample the record to a yearly frequency for each location\n', yearly)
print('\n\nDownsample the record to a monthly frequency for each location\n', monthly)
print('\n\nDownsample the record to a weekly frequency for each location\n', weekly)
print('\n\nCalculate the min, max and mean windspeeds and standard deviations of the windspeeds across all locations for each week (assume that the first week starts on January 2 1961) for the first 21 weeks\n', first_weeks_stats)