# train.py
import os
import random
import mlflow
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, InputLayer
from keras.models import Sequential
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
def reset_seeds(seed=42):
    """
    Seed every random number generator used by the training script.

    Seeds TensorFlow (`tf.random`), NumPy (`np.random`) and the standard-library
    `random` module with the same value, and exports that value through the
    `PYTHONHASHSEED` environment variable.

    Parameters:
        seed (int): Value applied to every generator. Defaults to 42, the value
            that used to be hard-coded, so existing callers are unaffected.
    Returns:
        None
    """
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this assignment
    # cannot change string hashing in the current process; it is only seen by
    # child processes that inherit the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
def read_data():
    """
    Download the reduced fetal-health CSV and separate features from target.

    The file is fetched over HTTPS from raw.githubusercontent.com with
    `pandas.read_csv`. The `fetal_health` column is the target; every other
    column is treated as a feature.

    Returns:
        X (pandas.DataFrame): Every column except `fetal_health`.
        y (pandas.Series): The `fetal_health` column.
    """
    host = 'raw.githubusercontent.com'
    owner = 'my_user'
    # This segment carries the repository name plus the branch ref and the
    # dataset folder, not the repository name alone.
    repo_path = 'MLOps_Cardiotocography/refs/heads/main/dataset'
    csv_name = 'fetal_health_reduced.csv'
    target_column = "fetal_health"

    frame = pd.read_csv(f'https://{host}/{owner}/{repo_path}/{csv_name}')
    return frame.drop(columns=[target_column]), frame[target_column]
def process_data(X, y):
    """
    Split the data into training and testing sets and standardize the features.

    The split (70% train / 30% test, random_state=42) is performed first; the
    StandardScaler is then fitted on the training rows only and applied to both
    partitions, so no statistics of the held-out rows leak into training.
    Both label vectors are shifted down by one.

    Parameters:
        X (pandas.DataFrame): The input data containing the features.
        y (pandas.Series): The target variable.
    Returns:
        X_train (pandas.DataFrame): The standardized training features.
        X_test (pandas.DataFrame): The testing features, scaled with the
            statistics learned from the training features.
        y_train (pandas.Series): The training labels, minus one.
        y_test (pandas.Series): The testing labels, minus one.
    """
    # Split before scaling: the partition depends only on the number of rows and
    # random_state, so the same rows land in each set as when scaling came first.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.3,
                                                                random_state=42)

    scaler = preprocessing.StandardScaler()
    # Keep the original column names and row index so features stay aligned
    # with their labels.
    X_train = pd.DataFrame(scaler.fit_transform(X_train_raw),
                           columns=X_train_raw.columns,
                           index=X_train_raw.index)
    X_test = pd.DataFrame(scaler.transform(X_test_raw),
                          columns=X_test_raw.columns,
                          index=X_test_raw.index)

    # Shift labels down by one; presumably the raw classes start at 1 and the
    # loss expects zero-based class ids -- confirm against the dataset.
    y_train = y_train - 1
    y_test = y_test - 1
    return X_train, X_test, y_train, y_test
def create_model(X):
    """
    Build and compile the feed-forward classifier.

    The network takes `X.shape[1]` inputs, passes them through two ReLU hidden
    layers of 10 units each and ends in a 3-unit softmax layer. It is compiled
    with sparse categorical cross-entropy, the Adam optimizer and accuracy as
    the tracked metric. Seeds are reset before any layer is created.

    Parameters:
        X: Two-dimensional data exposing `.shape`; only the second dimension
            (the number of features) is read.
    Returns:
        The compiled Sequential model.
    """
    reset_seeds()

    n_features = X.shape[1]
    layer_stack = (
        InputLayer(input_shape=(n_features,)),
        Dense(10, activation='relu'),
        Dense(10, activation='relu'),
        Dense(3, activation='softmax'),
    )

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(loss='sparse_categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network
def config_mlflow():
    """
    Configure MLflow tracking for the training run.

    Makes sure the MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD
    environment variables are defined, sets the MLflow tracking URI to
    'https://dagshub.com/my_user/mlops_cardiotocography.mlflow', and enables
    TensorFlow autologging with model, input-example and model-signature
    logging switched on.

    Credentials that are already present in the environment are left untouched;
    the values written in this file are used only as a fallback when a variable
    is missing.

    Parameters:
        None
    Returns:
        None
    """
    # setdefault rather than plain assignment: credentials supplied from outside
    # the process (shell, CI secrets) must not be overwritten by in-file values.
    os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'my_user')
    os.environ.setdefault('MLFLOW_TRACKING_PASSWORD', 'my_token')
    mlflow.set_tracking_uri('https://dagshub.com/my_user/mlops_cardiotocography.mlflow')
    mlflow.tensorflow.autolog(log_models=True,
                              log_input_examples=True,
                              log_model_signatures=True)
def train_model(model, X_train, y_train, is_train=True):
    """
    Fit the model inside an MLflow run and optionally register it.

    Training runs for 50 epochs with 20% of the training data held out for
    validation, inside a run named 'experiment_mlops_cardiotocography'.

    Parameters:
    - model: The model to fit; must expose `fit`.
    - X_train: The training data.
    - y_train: The target labels.
    - is_train: (optional) When true, register the run under the name
    'fetal_health' with mlflow. Defaults to True.
    Returns:
    None
    """
    fit_options = dict(epochs=50, validation_split=0.2, verbose=3)

    with mlflow.start_run(run_name='experiment_mlops_cardiotocography') as run:
        model.fit(X_train, y_train, **fit_options)

    if not is_train:
        return

    # NOTE(review): the URI carries only the run id, without an artifact path
    # such as '/model' -- confirm this is what consumers of the registry expect.
    mlflow.register_model(f'runs:/{run.info.run_id}', 'fetal_health')
def main():
    """
    Run the training pipeline end to end.

    Loads the data, preprocesses and splits it, builds the model, configures
    MLflow and then trains (and registers) the model. The test partition
    returned by the preprocessing step is not used here.
    """
    features, target = read_data()
    X_train, _X_test, y_train, _y_test = process_data(features, target)
    # Only the feature count is read from `features` when building the model.
    network = create_model(features)
    config_mlflow()
    train_model(network, X_train, y_train)


if __name__ == "__main__":
    main()