-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore.py
81 lines (71 loc) · 3.07 KB
/
explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
def train_validate_test_split(df, target, seed=123):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    every split on the column named by `target`.

    The split happens in two passes that share the same `seed`:
      1. 20% of `df` is held out as test.
      2. 30% of the remaining 80% is held out as validate (.30*.80 = 24% of
         the original rows); the rest is train (.70*.80 = 56%).

    Returns the three dataframes in the order train, validate, test.
    '''
    # First pass: carve the test set off the full dataframe.
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )
    # Second pass: divide what is left into train and validate.
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder[target],
    )
    return train, validate, test
###########################################################################
###### UNIVARIATE EXPLORATION FUNCTIONS ############
def explore_univariate(train, cat_vars, quant_vars):
    '''
    Run the univariate exploration for every listed variable of `train`.

    For each name in `cat_vars`, explore_univariate_categorical is called
    (it draws a bar plot and prints a frequency table) and a separator line
    is printed afterwards.
    For each name in `quant_vars`, explore_univariate_quant is called, the
    figure it drew is displayed, and the descriptive statistics it returned
    are printed.

    Nothing is returned; all output is plotted or printed.
    '''
    for var in cat_vars:
        explore_univariate_categorical(train, var)
        print('_________________________________________________________________')
    for col in quant_vars:
        # The first returned item is only a plot artist; the figure itself is
        # the current pyplot figure, so it is displayed with a bare show().
        _, descriptive_stats = explore_univariate_quant(train, col)
        # pyplot.show() takes only the keyword option `block`; the artist must
        # not be passed to it positionally.
        plt.show()
        print(descriptive_stats)
def explore_univariate_categorical(train, cat_var):
    '''
    Display the distribution of one categorical column of `train`.

    Builds the frequency table for `cat_var` with freq_table, draws a small
    bar plot of the 'Count' column per class, shows it, and then prints the
    frequency table. Nothing is returned.
    '''
    counts_by_class = freq_table(train, cat_var)

    bar_style = {'color': 'white', 'edgecolor': 'grey'}
    plt.figure(figsize=(2, 2))
    sns.barplot(data=counts_by_class, x=cat_var, y='Count', **bar_style)
    plt.title(cat_var)
    plt.show()

    print(counts_by_class)
def explore_univariate_quant(train, quant_var):
    '''
    Summarise and plot one quantitative column of `train`.

    Computes the describe() table for `quant_var` and draws, on a new
    8x2 figure, a histogram (left panel) and a box plot (right panel), each
    titled with the column name. The figure is not shown here; the caller
    decides when to display it.

    Returns a tuple of (object returned by the last plt.title call,
    descriptive stats).
    '''
    values = train[quant_var]
    descriptive_stats = values.describe()

    plt.figure(figsize=(8, 2))

    # left panel: histogram
    plt.subplot(1, 2, 1)
    plt.hist(values, color='white', edgecolor='grey')
    plt.title(quant_var)

    # right panel: box plot
    plt.subplot(1, 2, 2)
    plt.boxplot(values)
    last_title = plt.title(quant_var)

    return last_title, descriptive_stats
def freq_table(train, cat_var):
    '''
    Build a frequency table for one categorical column of `train`.

    Returns a dataframe with one row per class and three columns:
      - `cat_var`: the class label
      - 'Count': number of rows with that label
      - 'Percent': share of rows with that label, in percent, rounded to 2
        decimals

    Rows follow the order produced by value_counts(). Missing values are
    not counted by value_counts() and therefore do not appear in the table.
    '''
    counts = train[cat_var].value_counts(normalize=False)
    percents = round(train[cat_var].value_counts(normalize=True) * 100, 2)
    # Take the labels from the counts themselves. A plain list is placed into
    # the frame positionally, so labels gathered separately (e.g. in order of
    # first appearance) could end up next to another class's count.
    class_labels = list(counts.index)
    frequency_table = pd.DataFrame(
        {
            cat_var: class_labels,
            'Count': counts,
            'Percent': percents,
        }
    )
    return frequency_table