Commit

Update createEmbedding.py
gregoriofsg authored Sep 9, 2024
1 parent 0d91fd8 commit dfcdbf5
Showing 1 changed file with 97 additions and 160 deletions.
src/createEmbedding.py: 257 changes (97 additions, 160 deletions)
@@ -14,206 +14,143 @@
 import string
 import torch


 verbose = False


-def select_important(doc,mark):
+def extract_relevant_text(doc, marker):
     doc = doc.lower()
-    match = re.search(fr' {mark}[^\n]*',doc)
+    match = re.search(fr' {marker}[^\n]*', doc)
     if match:
         start = match.start()
         final_doc = doc[start:]
     else:
         final_doc = doc
-    return final_doc
-#Palavras irrelevantes
+    return final_doc

 def remove_punctuation(text):
     translator = str.maketrans('', '', string.punctuation)
-    text_without_punctuation = text.translate(translator)
-    return text_without_punctuation
+    return text.translate(translator)

 def remove_stopwords(text):
     stop_words = set(stopwords.words('portuguese'))
-    stop_words.update(["nº","cep","telefone","rua","avenida","endereço","fax","fones"])
-    stop_words.update(["egrégia","egrégio","eg","e.g."])
+    stop_words.update(["nº", "cep", "telefone", "rua", "avenida", "endereço", "fax", "fones"])
+    stop_words.update(["egrégia", "egrégio", "eg", "e.g."])
+    stop_words.update(["copy", "reg", "trade", "ldquo", "rdquo", "lsquo", "rsquo", "bull", "middot", "sdot", "ndash"])
+    stop_words.update(["mdash", "cent", "pound", "euro", "ne", "frac12", "frac14", "frac34", "deg", "larr", "rarr", "uarr", "darr", "egrave", "eacute", "ccedil", "hellip"])

-    stop_words.update(["copy","reg","trade","ldquo","rdquo","lsquo","rsquo","bull","middot","sdot","ndash"])
-    stop_words.update(["mdash","cent","pound","euro","ne","frac12","frac14","frac34","deg","larr","rarr","uarr","darr","egrave","eacute","ccedil","hellip"])
-    # Tokenizando o texto
     tokens = word_tokenize(text, language='portuguese')
     tokens_cleaned = [token for token in tokens if token not in stop_words]
     detokenizer = TreebankWordDetokenizer()
-    detokenized_text = detokenizer.detokenize(tokens_cleaned)
-    text_cleaned = ' '.join(tokens_cleaned)
-    return text_cleaned
+    return detokenizer.detokenize(tokens_cleaned)


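Note: the rewritten remove_stopwords relies on NLTK's Portuguese word tokenizer plus the Treebank detokenizer to rebuild the text after stop words are filtered out. A minimal sketch of that round trip, assuming the NLTK 'stopwords' and 'punkt' data are available (the sample sentence is illustrative only):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# One-off downloads, mirroring what main() does below.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('portuguese'))
tokens = word_tokenize("o recurso foi interposto contra a decisão", language='portuguese')
kept = [token for token in tokens if token not in stop_words]
# Detokenizing joins the surviving tokens back into a single string, as the function above does.
print(TreebankWordDetokenizer().detokenize(kept))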
 def clean_text(doc):
-    final_doc = doc
-    #Padrões irrelevantes
-    match_pattern = [r',\s*,',r'\bpágina\s+(\d+)\s+(\d+)\b',r'\bpágina\s+(\d+)\s+de\s+(\d+)\b',r'\?',r'\b_+(?:\d+|[a-zA-Z]+)?\b',r'https?://\S+',r'www\.\S+',r'\S+@\S+',r'^\d{3}.\d{3}.\d{3}-\d{2}$',r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',r'\d{2}/\d{2}/\d{4}[ ,]',r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',r'\btribunal regional federal( da) \d+[ªa] região\b',r'\badvocacia( -)geral da união\b',r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b',r'\bprocuradoria[ -]geral federal\b',r'\bescritório de advocacia\b',r'\b(superior) tribunal de justiça\b',r'\bsupremo tribunal federal\b',r'\bfones\b',r'\bfax\b']
-    subs = [''] * len(match_pattern)
-    for match_pattern, subs in zip(match_pattern,subs):
-        final_doc = re.sub(match_pattern,subs,final_doc)
-
-    final_doc = remove_stopwords(final_doc)
-
-    return final_doc
+    match_patterns = [
+        r',\s*,', r'\bpágina\s+(\d+)\s+(\d+)\b', r'\bpágina\s+(\d+)\s+de\s+(\d+)\b', r'\?', r'\b_+(?:\d+|[a-zA-Z]+)?\b',
+        r'https?://\S+', r'www\.\S+', r'\S+@\S+', r'^\d{3}.\d{3}.\d{3}-\d{2}$', r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',
+        r'\d{2}/\d{2}/\d{4}[ ,]', r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',
+        r'\btribunal regional federal( da) \d+[ªa] região\b', r'\badvocacia( -)geral da união\b',
+        r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b', r'\bprocuradoria[ -]geral federal\b',
+        r'\bescritório de advocacia\b', r'\b(superior) tribunal de justiça\b', r'\bsupremo tribunal federal\b',
+        r'\bfones\b', r'\bfax\b'
+    ]
+
+    final_doc = doc
+    for pattern in match_patterns:
+        final_doc = re.sub(pattern, '', final_doc)
+
+    return remove_stopwords(final_doc)

-def process_corpus(path,clean,begin,column):
+def process_corpus(file_path, clean, begin_point, column):
     try:
-        resp = pd.read_csv(path)
-        size = len(resp)
-        docs = []
-        num_cadastrado = []
-        indice = []
-        text = ""
-        print("############### Lendo registros do corpus ###############")
-        for i, linha in resp.iterrows():
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/size*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tipo = type(linha[column])
-            text = linha[column]
+        data = pd.read_csv(file_path)
+        size = len(data)
+        docs, registered_ids, indices = [], [], []
+        print("############### Reading corpus records ###############")
+
+        for i, row in data.iterrows():
+            sys.stdout.write(f'\r Progress: {i/size*100:.2f}%')
+            sys.stdout.flush()
+            text = row[column]
             try:
-                #tratamento necessário, textos estavam sendo identificados como float
-                if (tipo == str):
-                    #Se houver um marco de trecho relevante do texto
-                    if begin:
-                        text = select_important(text,begin)
-                    #print(text)
-                    #Se tiver de limpar o texto
+                if isinstance(text, str):
+                    if begin_point:
+                        text = extract_relevant_text(text, begin_point)
                     if clean:
-                        text_cleaned = clean_text(text)
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        #print(text_cleaned)
-                        docs.append(text_cleaned)
-                        indice.append(i)
-                    else:
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        docs.append(text)
-                        indice.append(i)
-
-            except Exception as erro:
-                print(f"Erro ao capturar numero de tema cadastrado {i} : {erro}")
+                        text = clean_text(text)
+
+                    registered_ids.append(int(row['num_tema_cadastrado']))
+                    docs.append(text)
+                    indices.append(i)
+            except Exception as error:
+                print(f"Error processing record {i}: {error}")
                 continue

-        corpus = pd.DataFrame()
-        corpus["indice"]=indice
-        corpus["num_tema_cadastrado"]=num_cadastrado
-        corpus[column]=docs
+        corpus_df = pd.DataFrame()
+        corpus_df["index"] = indices
+        corpus_df["num_tema_cadastrado"] = registered_ids
+        corpus_df[column] = docs
+        return corpus_df

-    except FileNotFoundError:
-        print(f"Error: File {path} not found")
-    except PermissionError:
-        print(f"Error: Permission denied for {path}")
-    except pd.errors.EmptyDataError:
-        print(f"Error: The path '{path}' is empty.")
-    except pd.errors.ParserError as e:
-        print(f"Error processing file: {e}")
-
-    return corpus
+    except (FileNotFoundError, PermissionError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+        print(f"Error: {e}")
+        return None

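Note: process_corpus assumes the input CSV carries a num_tema_cadastrado column in addition to the text column named on the command line. A hypothetical file with that shape (the column name "texto" and the row values are placeholders):

import pandas as pd

# Placeholder rows only; the real corpus supplies num_tema_cadastrado plus the chosen text column.
sample = pd.DataFrame({
    "num_tema_cadastrado": [1, 2],
    "texto": ["ementa do primeiro recurso", "ementa do segundo recurso"],
})
sample.to_csv("sample_corpus.csv", index=False)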
-def createFileName(name,clean):
-    if clean:
-        return f'{name}_EMBEDDING_CLEAN.pkl'
-    else:
-        return f'{name}_EMBEDDING.pkl'
+def create_file_name(base_name, clean):
+    return f'{base_name}_EMBEDDING_CLEAN.pkl' if clean else f'{base_name}_EMBEDDING.pkl'

-def create_embedding(file,indice,corpus,num,model,v,c,data_type):
-
-    sentence_model = SentenceTransformer(model)
-    nameEmbedding = createFileName(file.name.split('.')[0],c)
-    if(data_type == 'tema'):
-        print("############### Gerando embeddings dos temas ###############")
-        i = 0
-        for linha in corpus:
-            i = i+1
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/len(corpus)*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tema_clean = remove_punctuation(linha)
-            tokenized_temas = tema_clean.split(" ")
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
-
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
-    else:
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
-
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
+def generate_embeddings(file, index, corpus, labels, model_name, verbose, clean, data_type):
+    sentence_model = SentenceTransformer(model_name)
+    embedding_file = create_file_name(file.name.split('.')[0], clean)
+
+    if data_type == 'tema':
+        print("############### Generating embeddings for topics ###############")
+
+    if verbose:
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=True)
+    else:
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=False)
+
+    with open(embedding_file, "wb") as fOut:
+        pickle.dump({'index': index, 'sentences': corpus, 'numTema': labels, 'embeddings': corpus_embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+
+    print(f"Embeddings saved to {embedding_file}")

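Note: the pickle written by generate_embeddings is a dict with 'index', 'sentences', 'numTema' and 'embeddings' entries. A sketch of reading it back, assuming the script was run on a file named corpus.csv without --clean (so the output is corpus_EMBEDDING.pkl):

import pickle

with open("corpus_EMBEDDING.pkl", "rb") as fIn:
    stored = pickle.load(fIn)

# 'embeddings' is the array returned by SentenceTransformer.encode, one row per entry in 'sentences'.
print(len(stored["sentences"]))
print(stored["embeddings"].shape)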
 def main(args):
-    conteudo = []
-    print("############### PROGRAMA DE GERAÇÃO DE EMBEDDINGS ###############")
-    print("############### Configuração ###############")
-    print(f"Sentence-BERT Model : {args.model}")
-    print(f"Remoção de stopwords : {args.clean}")
+    print("############### EMBEDDING GENERATION PROGRAM ###############")
+    print(f"Sentence-BERT Model: {args.model}")
+    print(f"Removing stopwords: {args.clean}")

     if args.begin_point:
-        print(f"Considerar texto após primeira ocorrência da palavra: {args.begin_point}")
-    tempo_inicio = time.time()
-    print("\n\n############### Fazendo download de dependências ... ###############")
+        print(f"Processing text after the first occurrence of: {args.begin_point}")
+
+    start_time = time.time()
     nltk.download('stopwords')
     nltk.download('punkt')
     verbose = args.verbose
-    corpus = process_corpus(args.corpus_csv_file,args.clean,args.begin_point,args.column)
-
-    #Se não houver uma coluna indice, cria uma
-    #if 'indice' not in corpus.columns:
-        #corpus['indice'] = range(1,len(corpus)+1)
-
-    conteudo = corpus[args.column].tolist()
-    create_embedding(args.corpus_csv_file,corpus['indice'],conteudo,corpus['num_tema_cadastrado'],args.model,args.verbose,args.clean, args.data_type)
-
-    print("Salvando log ...")
-    tempo_fim = time.time()
-    tempo_total_segundos = tempo_fim - tempo_inicio
-    minutos, segundos = divmod(int(tempo_total_segundos), 60)
-    if args.clean:
-        log = 'log_corpus_embedding_clean.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Sim")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
-    else:
-        log = 'log_corpus_embedding.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Não")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
+    corpus = process_corpus(args.corpus_csv_file, args.clean, args.begin_point, args.column)
+
+    if corpus is not None:
+        content = corpus[args.column].tolist()
+        generate_embeddings(args.corpus_csv_file, corpus['index'], content, corpus['num_tema_cadastrado'], args.model, args.verbose, args.clean, args.data_type)
+
+    total_time = time.time() - start_time
+    minutes, seconds = divmod(int(total_time), 60)

+    log_file = 'log_embedding_clean.txt' if args.clean else 'log_embedding.txt'
+    with open(log_file, 'w') as log:
+        log.write("############### Corpus Embedding Creation ###############\n")
+        log.write(f"Sentence-BERT Model: {args.model}\n")
+        log.write(f"Removing stopwords: {'Yes' if args.clean else 'No'}\n")
+        log.write(f"Total execution time: {minutes} minutes and {seconds} seconds\n")

-if __name__=="__main__":
-
-    parser = argparse.ArgumentParser(description='Generate text embedding using Sentence-BERT model')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate text embeddings using Sentence-BERT')
     parser.add_argument('corpus_csv_file', type=argparse.FileType('r'), help='File containing the corpus')
-    parser.add_argument('data_type',choices=['recurso','tema'],help='Indicates whether the data type is recurso or tema')
-    parser.add_argument('column',help='Column that contains the text to be transformed into embedding')
-    parser.add_argument('model',default='distiluse-base-multilingual-cased-v1',nargs='?', help='The Sentence-BERT model used to generate embedding : Default = distiluse-base-multilingual-cased-v1')
-    parser.add_argument('--clean',action='store_true',help='Remove stopwords before creating embedding')
-    parser.add_argument('--begin_point',help='Word that marks the beginning of essential part of the text')
-    parser.add_argument('-v','--verbose',action='store_true',help='Increase the verbosity level')
+    parser.add_argument('data_type', choices=['recurso', 'tema'], help='Indicates whether the data type is "recurso" or "tema"')
+    parser.add_argument('column', help='Column with text to be converted into embeddings')
+    parser.add_argument('model', default='distiluse-base-multilingual-cased-v1', nargs='?', help='Sentence-BERT model used for embedding generation')
+    parser.add_argument('--clean', action='store_true', help='Remove stopwords before creating embeddings')
+    parser.add_argument('--begin_point', help='Keyword marking the start of the relevant text')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Increase verbosity')

     args = parser.parse_args()
     main(args)

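For reference, a hypothetical invocation given the arguments defined above (the file name, column name and begin_point value are placeholders):

python src/createEmbedding.py recursos.csv recurso texto --clean --begin_point ementa -v

With these options the script would write recursos_EMBEDDING_CLEAN.pkl (per create_file_name) and log_embedding_clean.txt with the timing summary.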

