-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassification.py
98 lines (83 loc) · 3.19 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
import sys
import csv
import matplotlib.pyplot as plt
import warnings
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support as score
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_rows',300000)
np.set_printoptions(threshold=sys.maxsize)
"""
@author: Talessil
Building and evaluating classification Algorithms
input: fp_input_def.csv
output: fp_input_def_resul.csv
"""
""" CLASSIFICATION ALGORITHMS """
dados = pd.read_csv("fp_input_def.csv", sep=";", header=0)
X = dados
Y = dados['requested']
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=7)
models=[]
models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('LR', LogisticRegression()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC(gamma='scale')))
results=[]
names=[]
for name, model in models:
model.fit(X_train, Y_train)
kfold = model_selection.KFold(n_splits=10, random_state=7)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
results.append(cv_results)
names.append(name)
preds = model.predict(X_test)
precision, recall, fscore, support = score(Y_test, preds)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
print(pd.crosstab(Y_test, preds, rownames=['Actual Result'], colnames=['Predicted Result']))
#save results
# Dump the held-out feature rows plus the model's predictions so the
# false-positive scan below can join them back to the ground-truth CSV.
aux = X_test.values
with open('fp_input_def_resul.csv', mode='w') as outfile:
    # The original shadowed the file handle with the writer ('result' was
    # reused for both) and counted rows with a manual loop; zip() pairs
    # each test row with its prediction directly.
    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONE, lineterminator='\n')
    writer.writerow(['author_id', 'discussion', 'review', 'qntags', 'pull', 'preds'])
    for row, pred in zip(aux, preds):
        writer.writerow([str(row[0]), str(row[1]), str(row[2]), str(row[3]), str(row[4]), str(pred)])
#show false positives
# Print the author_id of every prediction that disagrees with the ground
# truth: predicted 0 while the original label is 1.
# NOTE(review): with predicted=0 / actual=1 this is conventionally a false
# NEGATIVE for the 'requested' class, not a false positive — confirm the
# intended definition with the author.
dados = pd.read_csv("fp_input_def.csv", sep=";", header=0)
array = dados.values          # ground truth: col 0 = author_id, col 5 = requested
dados2 = pd.read_csv("fp_input_def_resul.csv", sep=";", header=0)
array2 = dados2.values        # predictions: col 0 = author_id, col 5 = preds
# Direct pairwise scan; replaces the original's manual size-counting loops
# and range(len) indexing while keeping identical output order (including
# one line per matching ground-truth row if an id appears more than once).
for pred_row in array2:
    for true_row in array:
        if true_row[0] == pred_row[0] and pred_row[5] == 0 and true_row[5] == 1:
            print(pred_row[0])
# Compare Algorithms
# Box-plot of each model's cross-validation accuracy distribution,
# one box per entry in `results`, labelled by `names`.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()