diff --git a/README.md b/README.md
index 6335ebc..14cd1fe 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
-# bedmess
+# BEDMS
 
-bedmess is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS).
+BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user (e.g. ENCODE, FAIRTRACKS, BEDBASE).
 
-To install `attribute-standardizer` , you need to clone this repository first. Follow the steps given below to install:
+To install `attribute-standardizer`, you need to clone this repository first. Follow the steps given below to install:
 
 ```
-git clone https://github.com/databio/bedmess.git
+git clone https://github.com/databio/bedms.git
 
-cd bedmess
+cd bedms
 
 pip install .
 
 ```
@@ -16,13 +16,28 @@
 
 ## Usage
 
-Using Python, this is how you can run `attribute_standardizer` :
+Using Python, this is how you can run `attribute_standardizer` and print the results:
 
 ```
-from attribute_standardizer.attribute_standardizer import attr_standardizer
+from attribute_standardizer import AttrStandardizer
 
-attr_standardizer(pep=/path/to/pep, schema="ENCODE")
+model = AttrStandardizer("ENCODE")
+# or: model = AttrStandardizer("FAIRTRACKS")
+
+results = model.standardize(pep="geo/gse178283:default")
+
+print(results)
+
+```
+
+To see the available schemas, you can run:
+```
+schemas = model.get_available_schemas()
+
+print(schemas)
 
 ```
+This will print the available schemas as a list.
+
 
 You can use the format provided in the `trial.py` script in this repository as a reference.
\ No newline at end of file
diff --git a/attribute_standardizer/__init__.py b/attribute_standardizer/__init__.py
index e5081d0..374c0be 100644
--- a/attribute_standardizer/__init__.py
+++ b/attribute_standardizer/__init__.py
@@ -1 +1 @@
-from .attribute_standardizer import attr_standardizer
+from .attr_standardizer import AttrStandardizer
diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py
new file mode 100644
index 0000000..13bf949
--- /dev/null
+++ b/attribute_standardizer/attr_standardizer.py
@@ -0,0 +1,207 @@
+import logging
+from typing import Dict, Tuple, Union
+
+import peppy
+import torch
+import torch.nn as nn
+import torch.nn.functional as torch_functional
+
+from .const import (
+    CONFIDENCE_THRESHOLD,
+    DROPOUT_PROB,
+    EMBEDDING_SIZE,
+    HIDDEN_SIZE,
+    INPUT_SIZE_BOW_BEDBASE,
+    INPUT_SIZE_BOW_ENCODE,
+    INPUT_SIZE_BOW_FAIRTRACKS,
+    OUTPUT_SIZE_BEDBASE,
+    OUTPUT_SIZE_ENCODE,
+    OUTPUT_SIZE_FAIRTRACKS,
+    SENTENCE_TRANSFORMER_MODEL,
+    PROJECT_NAME,
+)
+from .model import BoWSTModel
+from .utils import (
+    data_encoding,
+    data_preprocessing,
+    fetch_from_pephub,
+    get_any_pep,
+    load_from_huggingface,
+)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(PROJECT_NAME)
+
+
+class AttrStandardizer:
+    def __init__(self, schema: str, confidence: float = CONFIDENCE_THRESHOLD) -> None:
+        """
+        Initializes the attribute standardizer with the user-provided schema and loads the model.
+
+        :param str schema: User-provided schema, can be "ENCODE", "FAIRTRACKS" or "BEDBASE".
+        :param float confidence: Confidence threshold for the predictions.
+        """
+        self.schema = schema
+        self.model = self._load_model()
+        self.conf_threshold = confidence
+
+    def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
+        """
+        Get the model parameters as per the chosen schema.
+
+        :return Tuple[int, int, int, int, int, float]: Tuple containing the model parameters.
+        """
+        if self.schema == "ENCODE":
+            return (
+                INPUT_SIZE_BOW_ENCODE,
+                EMBEDDING_SIZE,
+                EMBEDDING_SIZE,
+                HIDDEN_SIZE,
+                OUTPUT_SIZE_ENCODE,
+                DROPOUT_PROB,
+            )
+        elif self.schema == "FAIRTRACKS":
+            return (
+                INPUT_SIZE_BOW_FAIRTRACKS,
+                EMBEDDING_SIZE,
+                EMBEDDING_SIZE,
+                HIDDEN_SIZE,
+                OUTPUT_SIZE_FAIRTRACKS,
+                DROPOUT_PROB,
+            )
+        elif self.schema == "BEDBASE":
+            return (
+                INPUT_SIZE_BOW_BEDBASE,
+                EMBEDDING_SIZE,
+                EMBEDDING_SIZE,
+                HIDDEN_SIZE,
+                OUTPUT_SIZE_BEDBASE,
+                DROPOUT_PROB,
+            )
+        else:
+            raise ValueError(
+                f"Schema not available: {self.schema}. Presently, three schemas are available: ENCODE, FAIRTRACKS, BEDBASE"
+            )
+
+    def _load_model(self) -> nn.Module:
+        """
+        Loads the model from the HuggingFace repository and sets it to eval() mode.
+
+        :return nn.Module: Loaded Neural Network Model.
+        """
+        try:
+            model = load_from_huggingface(self.schema)
+            state_dict = torch.load(model)
+
+            (
+                input_size_values,
+                input_size_values_embeddings,
+                input_size_headers,
+                hidden_size,
+                output_size,
+                dropout_prob,
+            ) = self._get_parameters()
+
+            model = BoWSTModel(
+                input_size_values,
+                input_size_values_embeddings,
+                input_size_headers,
+                hidden_size,
+                output_size,
+                dropout_prob,
+            )
+            model.load_state_dict(state_dict)
+            model.eval()
+            return model
+
+        except Exception as e:
+            logger.error(f"Error loading the model: {str(e)}")
+            raise
+
+    def standardize(
+        self, pep: Union[str, peppy.Project]
+    ) -> Dict[str, Dict[str, float]]:
+        """
+        Fetches the user-provided PEP from the PEPHub registry path and returns the predictions.
+
+        :param Union[str, peppy.Project] pep: peppy.Project object or PEPHub registry path to the PEP.
+        :return Dict[str, Dict[str, float]]: Suggestions to the user.
+        """
+        if isinstance(pep, str):
+            pep = get_any_pep(pep)
+        elif isinstance(pep, peppy.Project):
+            pass
+        else:
+            raise ValueError(
+                "PEP should be either a path to PEPHub registry or peppy.Project object."
+            )
+        try:
+            df = fetch_from_pephub(pep)
+
+            X_values_st, X_headers_st, X_values_bow, num_rows = data_preprocessing(
+                df
+            )
+            (
+                X_headers_embeddings_tensor,
+                X_values_embeddings_tensor,
+                X_values_bow_tensor,
+                label_encoder,
+            ) = data_encoding(
+                num_rows,
+                X_values_st,
+                X_headers_st,
+                X_values_bow,
+                self.schema,
+                model_name=SENTENCE_TRANSFORMER_MODEL,
+            )
+
+            logger.info("Data Preprocessing completed.")
+
+            with torch.no_grad():
+                outputs = self.model(
+                    X_values_bow_tensor,
+                    X_values_embeddings_tensor,
+                    X_headers_embeddings_tensor,
+                )
+                probabilities = torch_functional.softmax(outputs, dim=1)
+
+                values, indices = torch.topk(probabilities, k=3, dim=1)
+                top_preds = indices.tolist()
+                top_confidences = values.tolist()
+
+                decoded_predictions = [
+                    label_encoder.inverse_transform(indices) for indices in top_preds
+                ]
+
+            suggestions = {}
+            for i, category in enumerate(X_headers_st):
+                category_suggestions = {}
+                if top_confidences[i][0] >= self.conf_threshold:
+                    for j in range(3):
+                        prediction = decoded_predictions[i][j]
+                        probability = top_confidences[i][j]
+                        if probability >= self.conf_threshold:
+                            category_suggestions[prediction] = probability
+                        else:
+                            break
+                else:
+                    category_suggestions["Not Predictable"] = 0.0
+
+                suggestions[category] = category_suggestions
+
+            return suggestions
+
+        except Exception as e:
+            logger.error(
+                f"Error occurred during standardization in standardize function: {str(e)}"
+            )
+            raise
+
+    @staticmethod
+    def get_available_schemas() -> list[str]:
+        """
+        Returns the list of available schemas.
+        :return list: List of available schemas.
+ """ + schemas = ["ENCODE", "FAIRTRACKS", "BEDBASE"] + return schemas diff --git a/attribute_standardizer/attribute_standardizer.py b/attribute_standardizer/attribute_standardizer.py deleted file mode 100644 index fcc82d4..0000000 --- a/attribute_standardizer/attribute_standardizer.py +++ /dev/null @@ -1,148 +0,0 @@ -import pandas as pd -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import logging -from .const import ( - HIDDEN_SIZE, - DROPOUT_PROB, - CONFIDENCE_THRESHOLD, - EMBEDDING_SIZE, - SENTENCE_TRANSFORMER_MODEL, -) - -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, -) -from .model import BoWSTModel -from huggingface_hub import hf_hub_download -from typing import Dict, List, Tuple, Any, Union - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def standardize_attr_names(csv_file: str, schema: str) -> Dict[str, Dict[str, float]]: - """ - Standardize attribute names. - - :param str csv_file: Path to the CSV file containing metadata to be standardized. - :param str schema: Schema type. - :return Dict[str, Dict[str, float]]: Suggestions for standardized attribute names. - """ - try: - X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) - ( - X_headers_embeddings_tensor, - X_values_embeddings_tensor, - X_values_bow_tensor, - label_encoder, - ) = data_encoding( - X_values_st, - X_headers_st, - X_values_bow, - schema, - model_name=SENTENCE_TRANSFORMER_MODEL, - ) - logger.info("Data Preprocessing completed.") - - model = load_from_huggingface(schema) - # print(model) - state_dict = torch.load(model) - - """Padding the input tensors.""" - - padded_data_values_tensor = torch.zeros( - X_values_bow_tensor.shape[0], state_dict["fc_values1.weight"].shape[1] - ) - padded_data_headers_tensor = torch.zeros( - X_headers_embeddings_tensor.shape[0], - state_dict["fc_headers1.weight"].shape[1], - ) - padded_data_values_embeddings_tensor = torch.zeros( - X_values_embeddings_tensor.shape[0], - state_dict["fc_values_embeddings1.weight"].shape[1], - ) - - padded_data_values_tensor[:, : X_values_bow_tensor.shape[1]] = ( - X_values_bow_tensor - ) - padded_data_headers_tensor[:, : X_headers_embeddings_tensor.shape[1]] = ( - X_headers_embeddings_tensor - ) - padded_data_values_embeddings_tensor[ - :, : X_values_embeddings_tensor.shape[1] - ] = X_values_embeddings_tensor - - input_size_values = padded_data_values_tensor.shape[1] - input_size_headers = EMBEDDING_SIZE - input_size_values_embeddings = EMBEDDING_SIZE - hidden_size = HIDDEN_SIZE - output_size = len(label_encoder.classes_) - dropout_prob = DROPOUT_PROB - model = BoWSTModel( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - - model.load_state_dict(state_dict) - - model.eval() - - all_preds = [] - all_confidences = [] - with torch.no_grad(): - outputs = model( - padded_data_values_tensor, - padded_data_values_embeddings_tensor, - padded_data_headers_tensor, - ) - probabilities = F.softmax(outputs, dim=1) - confidence, predicted = torch.max(probabilities, 1) - all_preds.extend(predicted.tolist()) - all_confidences.extend(confidence.tolist()) - - decoded_predictions = label_encoder.inverse_transform(all_preds) - - suggestions = {} - for i, category in enumerate(X_headers_st): - if all_confidences[i] >= CONFIDENCE_THRESHOLD: - prediction = decoded_predictions[i] - probability = all_confidences[i] - else: - prediction = "Not 
Predictable" - probability = 0.0 - suggestions[category] = {prediction: probability} - - return suggestions - except Exception as e: - logger.error(f"Error occured in standardize_attr_names: {str(e)}") - return {} - - -def attr_standardizer(pep: str, schema: str) -> None: - """ - :param str pep: Path to the PEPhub registry containing the metadata csv file. - :param str schema: Schema Type chosen by the user. - """ - if not pep: - raise ValueError( - "pep argument is missing or empty. Please provide the PEPHub registry path to PEP" - ) - if not schema: - raise ValueError( - "schema argument is missing or empty. Please mention the schema of choice: ENCODE or FAIRTRACKS." - ) - csv_file = fetch_from_pephub(pep) - suggestions = standardize_attr_names(csv_file, schema) - - logger.info(suggestions) diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index 7ed657a..54e9b06 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -1,12 +1,24 @@ +PROJECT_NAME = "bedmess" + REPO_ID = "databio/attribute-standardizer-model6" -FILENAME_ENCODE = "model_encode.pth" -FILENAME_FAIRTRACKS = "model_fairtracks.pth" +MODEL_ENCODE = "model_encode.pth" +MODEL_FAIRTRACKS = "model_fairtracks.pth" +MODEL_BEDBASE = "model_bedbase.pth" ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" +BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" +BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 256 -DROPOUT_PROB = 0.203 -CONFIDENCE_THRESHOLD = 0.9 +HIDDEN_SIZE = 32 +DROPOUT_PROB = 0.113 +CONFIDENCE_THRESHOLD = 0.70 EMBEDDING_SIZE = 384 +INPUT_SIZE_BOW_ENCODE = 10459 +INPUT_SIZE_BOW_FAIRTRACKS = 13617 +OUTPUT_SIZE_FAIRTRACKS = 15 +OUTPUT_SIZE_ENCODE = 18 +NUM_CLUSTERS = 3 +INPUT_SIZE_BOW_BEDBASE = 13708 +OUTPUT_SIZE_BEDBASE = 12 diff --git a/attribute_standardizer/model.py b/attribute_standardizer/model.py index 23b9109..af212bc 100644 --- a/attribute_standardizer/model.py +++ b/attribute_standardizer/model.py @@ -29,18 +29,12 @@ def __init__( super(BoWSTModel, self).__init__() self.fc_values1 = nn.Linear(input_size_values, hidden_size) self.dropout_values1 = nn.Dropout(dropout_prob) - self.fc_values2 = nn.Linear(hidden_size, hidden_size) - self.dropout_values2 = nn.Dropout(dropout_prob) self.fc_values_embeddings1 = nn.Linear( input_size_values_embeddings, hidden_size ) self.dropout_values_embeddings1 = nn.Dropout(dropout_prob) - self.fc_values_embeddings2 = nn.Linear(hidden_size, hidden_size) - self.dropout_values_embeddings2 = nn.Dropout(dropout_prob) self.fc_headers1 = nn.Linear(input_size_headers, hidden_size) self.dropout_headers1 = nn.Dropout(dropout_prob) - self.fc_headers2 = nn.Linear(hidden_size, hidden_size) - self.dropout_headers2 = nn.Dropout(dropout_prob) self.fc_combined1 = nn.Linear(hidden_size * 3, hidden_size) self.dropout_combined1 = nn.Dropout(dropout_prob) self.fc_combined2 = nn.Linear(hidden_size, output_size) @@ -61,16 +55,10 @@ def forward( """ x_values = F.relu(self.fc_values1(x_values)) x_values = self.dropout_values1(x_values) - x_values = F.relu(self.fc_values2(x_values)) - x_values = self.dropout_values2(x_values) x_values_embeddings = F.relu(self.fc_values_embeddings1(x_values_embeddings)) x_values_embeddings = self.dropout_values_embeddings1(x_values_embeddings) - 
x_values_embeddings = F.relu(self.fc_values_embeddings2(x_values_embeddings))
-        x_values_embeddings = self.dropout_values_embeddings2(x_values_embeddings)
         x_headers = F.relu(self.fc_headers1(x_headers))
         x_headers = self.dropout_headers1(x_headers)
-        x_headers = F.relu(self.fc_headers2(x_headers))
-        x_headers = self.dropout_headers2(x_headers)
 
         x_combined = torch.cat((x_values, x_values_embeddings, x_headers), dim=1)
         x_combined = F.relu(self.fc_combined1(x_combined))
diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py
index c16f025..aff492e 100644
--- a/attribute_standardizer/utils.py
+++ b/attribute_standardizer/utils.py
@@ -1,35 +1,49 @@
-import pandas as pd
+import pickle
+import warnings
+from collections import Counter
+from typing import Any, List, Optional, Tuple, Union
+
 import numpy as np
+import pandas as pd
+import peppy
 import torch
+from huggingface_hub import hf_hub_download
 from pephubclient import PEPHubClient
 from sentence_transformers import SentenceTransformer
-import pickle
-from sklearn.preprocessing import LabelEncoder
+from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import CountVectorizer
-from collections import Counter
-from huggingface_hub import hf_hub_download
-from typing import Optional, Any, List, Tuple, Union
+from sklearn.preprocessing import LabelEncoder
+
 from .const import (
-    REPO_ID,
-    FILENAME_ENCODE,
-    FILENAME_FAIRTRACKS,
+    BEDBASE_LABEL_ENCODER_FILENAME,
+    BEDBASE_VECTORIZER_FILENAME,
     ENCODE_LABEL_ENCODER_FILENAME,
-    FAIRTRACKS_LABEL_ENCODER_FILENAME,
     ENCODE_VECTORIZER_FILENAME,
+    FAIRTRACKS_LABEL_ENCODER_FILENAME,
     FAIRTRACKS_VECTORIZER_FILENAME,
-    SENTENCE_TRANSFORMER_MODEL,
+    MODEL_BEDBASE,
+    MODEL_ENCODE,
+    MODEL_FAIRTRACKS,
+    NUM_CLUSTERS,
+    REPO_ID,
+)
+
+# TODO : convert to single np array before converting to tensor
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message="Creating a tensor from a list of numpy.ndarrays is extremely slow.",
 )
 
 
-def fetch_from_pephub(pep: str) -> pd.DataFrame:
+def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame:
     """
     Fetches metadata from PEPhub registry.
 
-    :param str pep: Path to the PEPhub registry containing the metadata csv file
-    :return pd.DataFrame: path to the CSV file on the local system.
+    :param peppy.Project project: peppy.Project object containing the sample metadata.
+    :return pd.DataFrame: Sample table of the PEP as a DataFrame.
     """
-    phc = PEPHubClient()
-    project = phc.load_project(pep)
+    sample_table = project.sample_table
     csv_file_df = pd.DataFrame(sample_table)
     return csv_file_df
 
@@ -43,15 +57,17 @@ def load_from_huggingface(schema: str) -> Optional[Any]:
     :return Optional[Any]: Loaded model object
     """
     if schema == "ENCODE":
-        model = hf_hub_download(repo_id=REPO_ID, filename=FILENAME_ENCODE)
+        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE)
     elif schema == "FAIRTRACKS":
-        model = hf_hub_download(repo_id=REPO_ID, filename=FILENAME_FAIRTRACKS)
+        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS)
+    elif schema == "BEDBASE":
+        model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE)
 
     return model
 
 
 def data_preprocessing(
     df: pd.DataFrame,
-) -> Tuple[List[List[str]], List[str], List[List[str]]]:
+) -> Tuple[List[List[str]], List[str], List[List[str]], int]:
     """
     Preprocessing the DataFrame by extracting the column values and headers.
@@ -60,13 +76,16 @@
         - Nested list containing the comma separated values in each column for sentence transformer embeddings.
        - List containing the headers of the DataFrame.
        - Nested list containing the comma separated values in each column for Bag of Words encoding.
+        - Number of rows in the metadata CSV
     """
 
     X_values_st = [df[column].astype(str).tolist() for column in df.columns]
     X_headers_st = df.columns.tolist()
     X_values_bow = [df[column].astype(str).tolist() for column in df.columns]
 
-    return X_values_st, X_headers_st, X_values_bow
+    num_rows = df.shape[0]
+
+    return X_values_st, X_headers_st, X_values_bow, num_rows
 
 
 def get_top_k_average(val_embedding: List[np.ndarray], k: int) -> np.ndarray:
@@ -89,7 +108,50 @@
     return column_embedding_mean.numpy()
 
 
+def get_top_cluster_averaged(embeddings: List[np.ndarray]) -> np.ndarray:
+    """
+    Calculates the average of the largest embedding cluster.
+
+    :param list embeddings: List of embeddings, each embedding is a vector of values.
+    :return np.ndarray: The mean of the largest cluster as a NumPy array.
+    """
+    flattened_embeddings = [embedding.tolist() for embedding in embeddings]
+    kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(flattened_embeddings)
+    labels_kmeans = kmeans.labels_
+    cluster_counts = Counter(labels_kmeans)
+    most_common_cluster = max(cluster_counts, key=cluster_counts.get)
+    most_common_indices = [
+        idx for idx, label in enumerate(labels_kmeans) if label == most_common_cluster
+    ]
+    most_common_embeddings = [
+        torch.tensor(embeddings[idx]) for idx in most_common_indices
+    ]
+
+    if most_common_embeddings:
+        top_k_average = torch.mean(
+            torch.stack(most_common_embeddings), dim=0
+        ).unsqueeze(0)
+    else:
+        top_k_average = torch.zeros(1, len(embeddings[0]))  # most_common_embeddings is empty here, so fall back to a zero embedding
+
+    return top_k_average.numpy()
+
+
+def get_averaged(embeddings: List[np.ndarray]) -> np.ndarray:
+    """
+    Averages the embeddings.
+    :param list embeddings: List of embeddings, each embedding is a vector of values.
+    :return np.ndarray: The mean of all the embeddings as a NumPy array.
+    """
+    flattened_embeddings = [embedding.tolist() for embedding in embeddings]
+    flattened_embeddings_array = np.array(flattened_embeddings)
+    averaged_embedding = np.mean(flattened_embeddings_array, axis=0)
+
+    return averaged_embedding
+
+
 def data_encoding(
+    num_rows: int,
     X_values_st: List[List[str]],
     X_headers_st: List[str],
     X_values_bow: List[List[str]],
@@ -99,6 +161,7 @@
     """
     Encode input data in accordance with the user-specified schemas.
 
+    :param int num_rows: Number of rows in the sample metadata
     :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings.
     :param list X_headers_st: List containing the headers of the DataFrame.
     :param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding.
@@ -114,7 +177,11 @@
     embeddings = []
     for column in X_values_st:
         val_embedding = sentence_encoder.encode(column, show_progress_bar=False)
-        embedding = get_top_k_average(val_embedding, k=3)
+        if num_rows >= 10:
+            embedding = get_top_cluster_averaged(val_embedding)
+        else:
+            embedding = get_averaged(val_embedding)
+
         embeddings.append(embedding)
     X_values_embeddings = embeddings
     if schema == "ENCODE":
@@ -167,11 +234,35 @@
     with open(lb_path, "rb") as f:
         label_encoder = pickle.load(f)
 
+    elif schema == "BEDBASE":
+        vectorizer = CountVectorizer()
+        vc_path = hf_hub_download(repo_id=REPO_ID, filename=BEDBASE_VECTORIZER_FILENAME)
+        with open(vc_path, "rb") as f:
+            vectorizer = pickle.load(f)
+        transformed_columns = []
+        for column in X_values_bow:
+            column_text = " ".join(column)
+            transformed_column = vectorizer.transform([column_text])
+            transformed_columns.append(transformed_column.toarray()[0])
+        transformed_columns = np.array(transformed_columns)
+        X_values_bow = transformed_columns
+        # Label Encoding
+        label_encoder = LabelEncoder()
+        lb_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=BEDBASE_LABEL_ENCODER_FILENAME,
+        )
+        with open(lb_path, "rb") as f:
+            label_encoder = pickle.load(f)
+
     X_headers_embeddings_tensor = torch.tensor(
         X_headers_embeddings, dtype=torch.float32
     )
     X_values_embeddings_tensor = torch.tensor(X_values_embeddings, dtype=torch.float32)
     X_values_bow_tensor = torch.tensor(X_values_bow, dtype=torch.float32)
+    X_values_embeddings_tensor = X_values_embeddings_tensor.squeeze(
+        1
+    )  # brings the shape to [num_cols, embedding_size]
 
     return (
         X_headers_embeddings_tensor,
@@ -179,3 +270,21 @@
         X_values_bow_tensor,
         label_encoder,
     )
+
+
+def get_any_pep(pep: str) -> peppy.Project:
+    """
+    Get the PEP file from the local system or from PEPhub.
+
+    :param pep: Path to the PEP file or PEPhub registry path.
+
+    :return: peppy.Project object.
+    """
+
+    PEP_FILE_TYPES = ["yaml", "csv"]
+
+    is_local_file = any(pep.endswith(ext) for ext in PEP_FILE_TYPES)
+    if is_local_file:
+        return peppy.Project(pep)
+    else:
+        return peppy.Project.from_pephub(pep)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 6642681..848e7e8 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -2,5 +2,5 @@ pandas
 numpy
 torch
 sentence-transformers
-pephubclient
-
+pephubclient>=0.4.2
+peppy>=0.40.5
diff --git a/scripts/model1.py b/scripts/model1.py
index bef41fb..0118add 100644
--- a/scripts/model1.py
+++ b/scripts/model1.py
@@ -29,7 +29,8 @@
 
 
 class NN1(nn.Module):
-    """ Simple Neural Network with a single Hidden Layer."""
+    """Simple Neural Network with a single Hidden Layer."""
+
     def __init__(self, input_size, hidden_size, output_size):
         """
         Initializes the NN1 model.
@@ -45,7 +46,7 @@ def __init__(self, input_size, hidden_size, output_size):
 
     def forward(self, x):
         """
-        Defines the forward pass of the neural network. 
+        Defines the forward pass of the neural network.
 
         :param torch.Tensor x: Input tensor.
         :return torch.Tensor: Output tensor after passing through the network.
@@ -86,14 +87,14 @@
         df_values_temp, test_size=0.5, random_state=42
     )
 
-    #Snippet for testing on unseen data
+    # Snippet for testing on unseen data
     """
     df_values_test = pd.read_csv(
         "/home/saanika/curation/scripts/bedmess_archive/data/encode_metadata_values_moderate.csv",
         sep=",",
     )
     """
-    #Comment out the above for training on seen data.
+    # Comment out the above for training on seen data.
 
X_values_train = [ df_values_train[column].astype(str).tolist() @@ -135,9 +136,9 @@ def data_split(df_values): def encoding(X_values_train, X_values_test, X_values_val, y_train, y_test, y_val): """ - Encodes the values for the model. + Encodes the values for the model. - :param list X_values_train: Training features. + :param list X_values_train: Training features. :param list X_values_test: Testing features. :param list X_values_val: Validation features. :param list y_train: Training labels. diff --git a/trial.py b/trial.py index 160ae30..1df22e1 100644 --- a/trial.py +++ b/trial.py @@ -1,3 +1,12 @@ -from attribute_standardizer.attribute_standardizer import attr_standardizer +from attribute_standardizer.attr_standardizer import AttrStandardizer -attr_standardizer(pep="geo/gse178283:default", schema="ENCODE") +model = AttrStandardizer("ENCODE") + +schemas = model.get_available_schemas() + +print(schemas) + +# results = model.standardize(pep="geo/gse178283:default") +results = model.standardize(pep="geo/gse228634:default") + +print(results)
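
---

A minimal usage sketch of the class-based API this patch introduces (the registry path is the one exercised in `trial.py`; selecting the top suggestion per attribute is illustrative glue code, not part of the library):

```python
from attribute_standardizer import AttrStandardizer

# Standardize sample metadata attribute names against the ENCODE schema.
model = AttrStandardizer("ENCODE")
suggestions = model.standardize(pep="geo/gse228634:default")

# `standardize` returns {attribute: {prediction: confidence, ...}} with up to
# three predictions per attribute; keep the highest-confidence suggestion.
for attribute, predictions in suggestions.items():
    best = max(predictions, key=predictions.get)
    print(f"{attribute} -> {best} ({predictions[best]:.2f})")
```

Attributes whose top prediction falls below the confidence threshold (`CONFIDENCE_THRESHOLD`, 0.70 after this change) come back as `{"Not Predictable": 0.0}`.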