diff --git a/src/createEmbedding.py b/src/createEmbedding.py
index c734993..2235dcc 100644
--- a/src/createEmbedding.py
+++ b/src/createEmbedding.py
@@ -14,206 +14,143 @@
 import string
 import torch
-
 verbose = False
-
-def select_important(doc,mark):
+def extract_relevant_text(doc, marker):
     doc = doc.lower()
-    match = re.search(fr' {mark}[^\n]*',doc)
+    match = re.search(fr' {marker}[^\n]*', doc)
     if match:
         start = match.start()
         final_doc = doc[start:]
     else:
         final_doc = doc
-    return final_doc
-    #Palavras irrelevantes
+    return final_doc
+
 def remove_punctuation(text):
     translator = str.maketrans('', '', string.punctuation)
-    text_without_punctuation = text.translate(translator)
-    return text_without_punctuation
+    return text.translate(translator)
 def remove_stopwords(text):
     stop_words = set(stopwords.words('portuguese'))
-    stop_words.update(["nº","cep","telefone","rua","avenida","endereço","fax","fones"])
-    stop_words.update(["egrégia","egrégio","eg","e.g."])
+    stop_words.update(["nº", "cep", "telefone", "rua", "avenida", "endereço", "fax", "fones"])
+    stop_words.update(["egrégia", "egrégio", "eg", "e.g."])
+    stop_words.update(["copy", "reg", "trade", "ldquo", "rdquo", "lsquo", "rsquo", "bull", "middot", "sdot", "ndash"])
+    stop_words.update(["mdash", "cent", "pound", "euro", "ne", "frac12", "frac14", "frac34", "deg", "larr", "rarr", "uarr", "darr", "egrave", "eacute", "ccedil", "hellip"])
-    stop_words.update(["copy","reg","trade","ldquo","rdquo","lsquo","rsquo","bull","middot","sdot","ndash"])
-    stop_words.update(["mdash","cent","pound","euro","ne","frac12","frac14","frac34","deg","larr","rarr","uarr","darr","egrave","eacute","ccedil","hellip"])
-    # Tokenizando o texto
     tokens = word_tokenize(text, language='portuguese')
     tokens_cleaned = [token for token in tokens if token not in stop_words]
     detokenizer = TreebankWordDetokenizer()
-    detokenized_text = detokenizer.detokenize(tokens_cleaned)
-    text_cleaned = ' '.join(tokens_cleaned)
-    return text_cleaned
+    return detokenizer.detokenize(tokens_cleaned)
-
 def clean_text(doc):
-    final_doc = doc
-    #Padrões irrelevantes
-    match_pattern = [r',\s*,',r'\bpágina\s+(\d+)\s+(\d+)\b',r'\bpágina\s+(\d+)\s+de\s+(\d+)\b',r'\?',r'\b_+(?:\d+|[a-zA-Z]+)?\b',r'https?://\S+',r'www\.\S+',r'\S+@\S+',r'^\d{3}.\d{3}.\d{3}-\d{2}$',r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',r'\d{2}/\d{2}/\d{4}[ ,]',r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',r'\btribunal regional federal( da) \d+[ªa] região\b',r'\badvocacia( -)geral da união\b',r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b',r'\bprocuradoria[ -]geral federal\b',r'\bescritório de advocacia\b',r'\b(superior) tribunal de justiça\b',r'\bsupremo tribunal federal\b',r'\bfones\b',r'\bfax\b']
-    subs = [''] * len(match_pattern)
-    for match_pattern, subs in zip(match_pattern,subs):
-        final_doc = re.sub(match_pattern,subs,final_doc)
+    match_patterns = [
+        r',\s*,', r'\bpágina\s+(\d+)\s+(\d+)\b', r'\bpágina\s+(\d+)\s+de\s+(\d+)\b', r'\?', r'\b_+(?:\d+|[a-zA-Z]+)?\b',
+        r'https?://\S+', r'www\.\S+', r'\S+@\S+', r'^\d{3}.\d{3}.\d{3}-\d{2}$', r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',
+        r'\d{2}/\d{2}/\d{4}[ ,]', r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',
+        r'\btribunal regional federal( da) \d+[ªa] região\b', r'\badvocacia( -)geral da união\b',
+        r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b', r'\bprocuradoria[ -]geral federal\b',
+        r'\bescritório de advocacia\b', r'\b(superior) tribunal de justiça\b', r'\bsupremo tribunal federal\b',
+        r'\bfones\b', r'\bfax\b'
+    ]
-    final_doc = remove_stopwords(final_doc)
-
-    return final_doc
+    final_doc = doc
+    for pattern in match_patterns:
+        final_doc = re.sub(pattern, '', final_doc)
-def process_corpus(path,clean,begin,column):
+    return remove_stopwords(final_doc)
+
+def process_corpus(file_path, clean, begin_point, column):
     try:
-        resp = pd.read_csv(path)
-        size = len(resp)
-        docs = []
-        num_cadastrado = []
-        indice = []
-        text = ""
-        print("############### Lendo registros do corpus ###############")
-        for i, linha in resp.iterrows():
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/size*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tipo = type(linha[column])
-            text = linha[column]
+        data = pd.read_csv(file_path)
+        size = len(data)
+        docs, registered_ids, indices = [], [], []
+        print("############### Reading corpus records ###############")
+
+        for i, row in data.iterrows():
+            sys.stdout.write(f'\r Progress: {i/size*100:.2f}%')
+            sys.stdout.flush()
+            text = row[column]
             try:
-                #tratamento necessário, textos estavam sendo identificados como float
-                if (tipo == str):
-                    #Se houver um marco de trecho relevante do texto
-                    if begin:
-                        text = select_important(text,begin)
-                        #print(text)
-                    #Se tiver de limpar o texto
+                if isinstance(text, str):
+                    if begin_point:
+                        text = extract_relevant_text(text, begin_point)
                     if clean:
-                        text_cleaned = clean_text(text)
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        #print(text_cleaned)
-                        docs.append(text_cleaned)
-                        indice.append(i)
-                    else:
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        docs.append(text)
-                        indice.append(i)
-
-            except Exception as erro:
-                print(f"Erro ao capturar numero de tema cadastrado {i} : {erro}")
+                        text = clean_text(text)
+
+                    registered_ids.append(int(row['num_tema_cadastrado']))
+                    docs.append(text)
+                    indices.append(i)
+            except Exception as error:
+                print(f"Error processing record {i}: {error}")
                 continue
-        corpus = pd.DataFrame()
-        corpus["indice"]=indice
-        corpus["num_tema_cadastrado"]=num_cadastrado
-        corpus[column]=docs
+        corpus_df = pd.DataFrame()
+        corpus_df["index"] = indices
+        corpus_df["num_tema_cadastrado"] = registered_ids
+        corpus_df[column] = docs
+        return corpus_df
-    except FileNotFoundError:
-        print(f"Error: File {path} not found")
-    except PermissionError:
-        print(f"Error: Permission denied for {path}")
-    except pd.errors.EmptyDataError:
-        print(f"Error: The path '{path}' is empty.")
-    except pd.errors.ParserError as e:
-        print(f"Error processing file: {e}")
-
-    return corpus
-
+    except (FileNotFoundError, PermissionError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+        print(f"Error: {e}")
+        return None
-def createFileName(name,clean):
-    if clean:
-        return f'{name}_EMBEDDING_CLEAN.pkl'
-    else:
-        return f'{name}_EMBEDDING.pkl'
+def create_file_name(base_name, clean):
+    return f'{base_name}_EMBEDDING_CLEAN.pkl' if clean else f'{base_name}_EMBEDDING.pkl'
-def create_embedding(file,indice,corpus,num,model,v,c,data_type):
+def generate_embeddings(file, index, corpus, labels, model_name, verbose, clean, data_type):
+    sentence_model = SentenceTransformer(model_name)
+    embedding_file = create_file_name(file.name.split('.')[0], clean)
-    sentence_model = SentenceTransformer(model)
-    nameEmbedding = createFileName(file.name.split('.')[0],c)
-    if(data_type == 'tema'):
-        print("############### Gerando embeddings dos temas ###############")
-        i = 0
-        for linha in corpus:
-            i = i+1
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/len(corpus)*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tema_clean = remove_punctuation(linha)
-            tokenized_temas = tema_clean.split(" ")
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
-
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
+    if data_type == 'tema':
+        print("############### Generating embeddings for topics ###############")
+
+    if verbose:
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=True)
     else:
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=False)
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
+    with open(embedding_file, "wb") as fOut:
+        pickle.dump({'index': index, 'sentences': corpus, 'numTema': labels, 'embeddings': corpus_embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+    print(f"Embeddings saved to {embedding_file}")
+
 def main(args):
-    conteudo = []
-    print("############### PROGRAMA DE GERAÇÃO DE EMBEDDINGS ###############")
-    print("############### Configuração ###############")
-    print(f"Sentence-BERT Model : {args.model}")
-    print(f"Remoção de stopwords : {args.clean}")
+    print("############### EMBEDDING GENERATION PROGRAM ###############")
+    print(f"Sentence-BERT Model: {args.model}")
+    print(f"Removing stopwords: {args.clean}")
+
     if args.begin_point:
-        print(f"Considerar texto após primeira ocorrência da palavra: {args.begin_point}")
-    tempo_inicio = time.time()
-    print("\n\n############### Fazendo download de dependências ... ###############")
+        print(f"Processing text after the first occurrence of: {args.begin_point}")
+
+    start_time = time.time()
     nltk.download('stopwords')
     nltk.download('punkt')
-    verbose = args.verbose
-    corpus = process_corpus(args.corpus_csv_file,args.clean,args.begin_point,args.column)
-
-    #Se não houver uma coluna indice, cria uma
-    #if 'indice' not in corpus.columns:
-    #corpus['indice'] = range(1,len(corpus)+1)
-
-    conteudo = corpus[args.column].tolist()
-    create_embedding(args.corpus_csv_file,corpus['indice'],conteudo,corpus['num_tema_cadastrado'],args.model,args.verbose,args.clean, args.data_type)
-    print("Salvando log ...")
-    tempo_fim = time.time()
-    tempo_total_segundos = tempo_fim - tempo_inicio
-    minutos, segundos = divmod(int(tempo_total_segundos), 60)
-    if args.clean:
-        log = 'log_corpus_embedding_clean.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Sim")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
-    else:
-        log = 'log_corpus_embedding.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Não")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
+    corpus = process_corpus(args.corpus_csv_file, args.clean, args.begin_point, args.column)
+    if corpus is not None:
+        content = corpus[args.column].tolist()
+        generate_embeddings(args.corpus_csv_file, corpus['index'], content, corpus['num_tema_cadastrado'], args.model, args.verbose, args.clean, args.data_type)
+
+    total_time = time.time() - start_time
+    minutes, seconds = divmod(int(total_time), 60)
+
+    log_file = 'log_embedding_clean.txt' if args.clean else 'log_embedding.txt'
+    with open(log_file, 'w') as log:
+        log.write("############### Corpus Embedding Creation ###############\n")
+        log.write(f"Sentence-BERT Model: {args.model}\n")
+        log.write(f"Removing stopwords: {'Yes' if args.clean else 'No'}\n")
+        log.write(f"Total execution time: {minutes} minutes and {seconds} seconds\n")
-if __name__=="__main__":
-
-    parser = argparse.ArgumentParser(description='Generate text embedding using Sentence-BERT model')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate text embeddings using Sentence-BERT')
     parser.add_argument('corpus_csv_file', type=argparse.FileType('r'), help='File containing the corpus')
-    parser.add_argument('data_type',choices=['recurso','tema'],help='Indicates whether the data type is recurso or tema')
-    parser.add_argument('column',help='Column that contains the text to be transformed into embedding')
-    parser.add_argument('model',default='distiluse-base-multilingual-cased-v1',nargs='?', help='The Sentence-BERT model used to generate embedding : Default = distiluse-base-multilingual-cased-v1')
-    parser.add_argument('--clean',action='store_true',help='Remove stopwords before creating embedding')
-    parser.add_argument('--begin_point',help='Word that marks the beginning of essential part of the text')
-    parser.add_argument('-v','--verbose',action='store_true',help='Increase the verbosity level')
+    parser.add_argument('data_type', choices=['recurso', 'tema'], help='Indicates whether the data type is "recurso" or "tema"')
+    parser.add_argument('column', help='Column with text to be converted into embeddings')
+    parser.add_argument('model', default='distiluse-base-multilingual-cased-v1', nargs='?', help='Sentence-BERT model used for embedding generation')
+    parser.add_argument('--clean', action='store_true', help='Remove stopwords before creating embeddings')
+    parser.add_argument('--begin_point', help='Keyword marking the start of the relevant text')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Increase verbosity')
+
     args = parser.parse_args()
     main(args)
-
-
-
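Usage note (not part of the patch): a minimal sketch of how the refactored script might be run and how its pickle output can be read back. The CSV name corpus.csv, the column name texto, and the begin-point word relatorio are hypothetical examples, not values taken from the repository.

    # Hypothetical invocation: positional arguments are the CSV file, the data type
    # ('recurso' or 'tema'), the text column, and an optional Sentence-BERT model name.
    #   python src/createEmbedding.py corpus.csv recurso texto --clean --begin_point relatorio -v
    #
    # With --clean, the run above would write corpus_EMBEDDING_CLEAN.pkl, which can be loaded back:
    import pickle

    with open("corpus_EMBEDDING_CLEAN.pkl", "rb") as f:
        data = pickle.load(f)

    indices = data["index"]          # row indices kept from the input CSV
    sentences = data["sentences"]    # the (optionally cleaned) texts
    topic_ids = data["numTema"]      # values taken from the num_tema_cadastrado column
    embeddings = data["embeddings"]  # Sentence-BERT embeddings, one vector per text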