Commit

Update createEmbedding.py
gregoriofsg authored Sep 9, 2024
1 parent 0d91fd8 commit dfcdbf5
Showing 1 changed file with 97 additions and 160 deletions.
src/createEmbedding.py: 257 changes (97 additions, 160 deletions)
@@ -14,206 +14,143 @@
 import string
 import torch


 verbose = False


-def select_important(doc,mark):
+def extract_relevant_text(doc, marker):
     doc = doc.lower()
-    match = re.search(fr' {mark}[^\n]*',doc)
+    match = re.search(fr' {marker}[^\n]*', doc)
     if match:
         start = match.start()
         final_doc = doc[start:]
     else:
         final_doc = doc
-    return final_doc
-#Palavras irrelevantes
+    return final_doc

 def remove_punctuation(text):
     translator = str.maketrans('', '', string.punctuation)
-    text_without_punctuation = text.translate(translator)
-    return text_without_punctuation
+    return text.translate(translator)

 def remove_stopwords(text):
     stop_words = set(stopwords.words('portuguese'))
-    stop_words.update(["nº","cep","telefone","rua","avenida","endereço","fax","fones"])
-    stop_words.update(["egrégia","egrégio","eg","e.g."])
+    stop_words.update(["nº", "cep", "telefone", "rua", "avenida", "endereço", "fax", "fones"])
+    stop_words.update(["egrégia", "egrégio", "eg", "e.g."])
+    stop_words.update(["copy", "reg", "trade", "ldquo", "rdquo", "lsquo", "rsquo", "bull", "middot", "sdot", "ndash"])
+    stop_words.update(["mdash", "cent", "pound", "euro", "ne", "frac12", "frac14", "frac34", "deg", "larr", "rarr", "uarr", "darr", "egrave", "eacute", "ccedil", "hellip"])

-    stop_words.update(["copy","reg","trade","ldquo","rdquo","lsquo","rsquo","bull","middot","sdot","ndash"])
-    stop_words.update(["mdash","cent","pound","euro","ne","frac12","frac14","frac34","deg","larr","rarr","uarr","darr","egrave","eacute","ccedil","hellip"])
-    # Tokenizando o texto
     tokens = word_tokenize(text, language='portuguese')
     tokens_cleaned = [token for token in tokens if token not in stop_words]
     detokenizer = TreebankWordDetokenizer()
-    detokenized_text = detokenizer.detokenize(tokens_cleaned)
-    text_cleaned = ' '.join(tokens_cleaned)
-    return text_cleaned
+    return detokenizer.detokenize(tokens_cleaned)


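Note: the rewritten remove_stopwords relies on NLTK's Portuguese word tokenizer plus the Treebank detokenizer to rebuild the text after stop words are filtered out. A minimal sketch of that round trip, assuming the NLTK 'stopwords' and 'punkt' data are available (the sample sentence is illustrative only):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# One-off downloads, mirroring what main() does below.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('portuguese'))
tokens = word_tokenize("o recurso foi interposto contra a decisão", language='portuguese')
kept = [token for token in tokens if token not in stop_words]
# Detokenizing joins the surviving tokens back into a single string, as the function above does.
print(TreebankWordDetokenizer().detokenize(kept))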
 def clean_text(doc):
-    final_doc = doc
-    #Padrões irrelevantes
-    match_pattern = [r',\s*,',r'\bpágina\s+(\d+)\s+(\d+)\b',r'\bpágina\s+(\d+)\s+de\s+(\d+)\b',r'\?',r'\b_+(?:\d+|[a-zA-Z]+)?\b',r'https?://\S+',r'www\.\S+',r'\S+@\S+',r'^\d{3}.\d{3}.\d{3}-\d{2}$',r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',r'\d{2}/\d{2}/\d{4}[ ,]',r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',r'\btribunal regional federal( da) \d+[ªa] região\b',r'\badvocacia( -)geral da união\b',r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b',r'\bprocuradoria[ -]geral federal\b',r'\bescritório de advocacia\b',r'\b(superior) tribunal de justiça\b',r'\bsupremo tribunal federal\b',r'\bfones\b',r'\bfax\b']
-    subs = [''] * len(match_pattern)
-    for match_pattern, subs in zip(match_pattern,subs):
-        final_doc = re.sub(match_pattern,subs,final_doc)
-
-    final_doc = remove_stopwords(final_doc)
-
-    return final_doc
+    match_patterns = [
+        r',\s*,', r'\bpágina\s+(\d+)\s+(\d+)\b', r'\bpágina\s+(\d+)\s+de\s+(\d+)\b', r'\?', r'\b_+(?:\d+|[a-zA-Z]+)?\b',
+        r'https?://\S+', r'www\.\S+', r'\S+@\S+', r'^\d{3}.\d{3}.\d{3}-\d{2}$', r'^\d{2}\.\d{3}\.\d{3}\/\d{4}\-\d{2}$',
+        r'\d{2}/\d{2}/\d{4}[ ,]', r'\bprocuradoria regional (federal|da união) da \d+[ªa] região\b',
+        r'\btribunal regional federal( da) \d+[ªa] região\b', r'\badvocacia( -)geral da união\b',
+        r'\b(excelentíssimo|senhor|vice-presidente|desembargador|\(a\))\b', r'\bprocuradoria[ -]geral federal\b',
+        r'\bescritório de advocacia\b', r'\b(superior) tribunal de justiça\b', r'\bsupremo tribunal federal\b',
+        r'\bfones\b', r'\bfax\b'
+    ]
+
+    final_doc = doc
+    for pattern in match_patterns:
+        final_doc = re.sub(pattern, '', final_doc)
+
+    return remove_stopwords(final_doc)

-def process_corpus(path,clean,begin,column):
+def process_corpus(file_path, clean, begin_point, column):
     try:
-        resp = pd.read_csv(path)
-        size = len(resp)
-        docs = []
-        num_cadastrado = []
-        indice = []
-        text = ""
-        print("############### Lendo registros do corpus ###############")
-        for i, linha in resp.iterrows():
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/size*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tipo = type(linha[column])
-            text = linha[column]
+        data = pd.read_csv(file_path)
+        size = len(data)
+        docs, registered_ids, indices = [], [], []
+        print("############### Reading corpus records ###############")
+
+        for i, row in data.iterrows():
+            sys.stdout.write(f'\r Progress: {i/size*100:.2f}%')
+            sys.stdout.flush()
+            text = row[column]
             try:
-                #tratamento necessário, textos estavam sendo identificados como float
-                if (tipo == str):
-                    #Se houver um marco de trecho relevante do texto
-                    if begin:
-                        text = select_important(text,begin)
-                    #print(text)
-                    #Se tiver de limpar o texto
+                if isinstance(text, str):
+                    if begin_point:
+                        text = extract_relevant_text(text, begin_point)
                     if clean:
-                        text_cleaned = clean_text(text)
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        #print(text_cleaned)
-                        docs.append(text_cleaned)
-                        indice.append(i)
-                    else:
-                        num_cadastrado.append(int(linha['num_tema_cadastrado']))
-                        docs.append(text)
-                        indice.append(i)
-
-            except Exception as erro:
-                print(f"Erro ao capturar numero de tema cadastrado {i} : {erro}")
+                        text = clean_text(text)
+
+                    registered_ids.append(int(row['num_tema_cadastrado']))
+                    docs.append(text)
+                    indices.append(i)
+            except Exception as error:
+                print(f"Error processing record {i}: {error}")
                 continue

-        corpus = pd.DataFrame()
-        corpus["indice"]=indice
-        corpus["num_tema_cadastrado"]=num_cadastrado
-        corpus[column]=docs
+        corpus_df = pd.DataFrame()
+        corpus_df["index"] = indices
+        corpus_df["num_tema_cadastrado"] = registered_ids
+        corpus_df[column] = docs
+        return corpus_df

-    except FileNotFoundError:
-        print(f"Error: File {path} not found")
-    except PermissionError:
-        print(f"Error: Permission denied for {path}")
-    except pd.errors.EmptyDataError:
-        print(f"Error: The path '{path}' is empty.")
-    except pd.errors.ParserError as e:
-        print(f"Error processing file: {e}")
-
-    return corpus
+    except (FileNotFoundError, PermissionError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+        print(f"Error: {e}")
+        return None

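Note: process_corpus assumes the input CSV carries a num_tema_cadastrado column in addition to the text column named on the command line. A hypothetical file with that shape (the column name "texto" and the row values are placeholders):

import pandas as pd

# Placeholder rows only; the real corpus supplies num_tema_cadastrado plus the chosen text column.
sample = pd.DataFrame({
    "num_tema_cadastrado": [1, 2],
    "texto": ["ementa do primeiro recurso", "ementa do segundo recurso"],
})
sample.to_csv("sample_corpus.csv", index=False)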
-def createFileName(name,clean):
-    if clean:
-        return f'{name}_EMBEDDING_CLEAN.pkl'
-    else:
-        return f'{name}_EMBEDDING.pkl'
+def create_file_name(base_name, clean):
+    return f'{base_name}_EMBEDDING_CLEAN.pkl' if clean else f'{base_name}_EMBEDDING.pkl'

-def create_embedding(file,indice,corpus,num,model,v,c,data_type):
-
-    sentence_model = SentenceTransformer(model)
-    nameEmbedding = createFileName(file.name.split('.')[0],c)
-    if(data_type == 'tema'):
-        print("############### Gerando embeddings dos temas ###############")
-        i = 0
-        for linha in corpus:
-            i = i+1
-            #Exibe percentual concluido
-            sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
-            sys.stdout.write(f' Percentual concluído: {i/len(corpus)*100:.2f}%')
-            sys.stdout.flush() # Força a impressão imediata
-            tema_clean = remove_punctuation(linha)
-            tokenized_temas = tema_clean.split(" ")
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
-
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
-    else:
-        #verbose
-        if v:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=True)
-        else:
-            corpus_embedding = sentence_model.encode(corpus,show_progress_bar=False)
-
-        with open(nameEmbedding, "wb") as fOut:
-            pickle.dump({'indice':indice,'sentences': corpus,'numTema':num ,'embeddings': corpus_embedding}, fOut,protocol=pickle.HIGHEST_PROTOCOL)
-        print(f"Embedding salvo no arquivo {nameEmbedding}")
+def generate_embeddings(file, index, corpus, labels, model_name, verbose, clean, data_type):
+    sentence_model = SentenceTransformer(model_name)
+    embedding_file = create_file_name(file.name.split('.')[0], clean)
+
+    if data_type == 'tema':
+        print("############### Generating embeddings for topics ###############")
+
+    if verbose:
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=True)
+    else:
+        corpus_embeddings = sentence_model.encode(corpus, show_progress_bar=False)
+
+    with open(embedding_file, "wb") as fOut:
+        pickle.dump({'index': index, 'sentences': corpus, 'numTema': labels, 'embeddings': corpus_embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+
+    print(f"Embeddings saved to {embedding_file}")

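Note: the pickle written by generate_embeddings is a dict with 'index', 'sentences', 'numTema' and 'embeddings' entries. A sketch of reading it back, assuming the script was run on a file named corpus.csv without --clean (so the output is corpus_EMBEDDING.pkl):

import pickle

with open("corpus_EMBEDDING.pkl", "rb") as fIn:
    stored = pickle.load(fIn)

# 'embeddings' is the array returned by SentenceTransformer.encode, one row per entry in 'sentences'.
print(len(stored["sentences"]))
print(stored["embeddings"].shape)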
 def main(args):
-    conteudo = []
-    print("############### PROGRAMA DE GERAÇÃO DE EMBEDDINGS ###############")
-    print("############### Configuração ###############")
-    print(f"Sentence-BERT Model : {args.model}")
-    print(f"Remoção de stopwords : {args.clean}")
+    print("############### EMBEDDING GENERATION PROGRAM ###############")
+    print(f"Sentence-BERT Model: {args.model}")
+    print(f"Removing stopwords: {args.clean}")

     if args.begin_point:
-        print(f"Considerar texto após primeira ocorrência da palavra: {args.begin_point}")
-    tempo_inicio = time.time()
-    print("\n\n############### Fazendo download de dependências ... ###############")
+        print(f"Processing text after the first occurrence of: {args.begin_point}")
+
+    start_time = time.time()
     nltk.download('stopwords')
     nltk.download('punkt')
     verbose = args.verbose
-    corpus = process_corpus(args.corpus_csv_file,args.clean,args.begin_point,args.column)
-
-    #Se não houver uma coluna indice, cria uma
-    #if 'indice' not in corpus.columns:
-        #corpus['indice'] = range(1,len(corpus)+1)
-
-    conteudo = corpus[args.column].tolist()
-    create_embedding(args.corpus_csv_file,corpus['indice'],conteudo,corpus['num_tema_cadastrado'],args.model,args.verbose,args.clean, args.data_type)
-
-    print("Salvando log ...")
-    tempo_fim = time.time()
-    tempo_total_segundos = tempo_fim - tempo_inicio
-    minutos, segundos = divmod(int(tempo_total_segundos), 60)
-    if args.clean:
-        log = 'log_corpus_embedding_clean.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Sim")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
-    else:
-        log = 'log_corpus_embedding.txt'
-        with open(log, 'w') as arquivo:
-            # Escrevendo os dados no arquivo
-            arquivo.write("############### Criação de Embedding do Corpus ###############")
-            arquivo.write(f"Sentence-BERT Model : {args.model}")
-            arquivo.write("Remoção de stopwords : Não")
-            arquivo.write(f"Tempo total de execução: {minutos} minutos e {segundos} segundos")
+    corpus = process_corpus(args.corpus_csv_file, args.clean, args.begin_point, args.column)
+
+    if corpus is not None:
+        content = corpus[args.column].tolist()
+        generate_embeddings(args.corpus_csv_file, corpus['index'], content, corpus['num_tema_cadastrado'], args.model, args.verbose, args.clean, args.data_type)
+
+    total_time = time.time() - start_time
+    minutes, seconds = divmod(int(total_time), 60)

+    log_file = 'log_embedding_clean.txt' if args.clean else 'log_embedding.txt'
+    with open(log_file, 'w') as log:
+        log.write("############### Corpus Embedding Creation ###############\n")
+        log.write(f"Sentence-BERT Model: {args.model}\n")
+        log.write(f"Removing stopwords: {'Yes' if args.clean else 'No'}\n")
+        log.write(f"Total execution time: {minutes} minutes and {seconds} seconds\n")

-if __name__=="__main__":
-
-    parser = argparse.ArgumentParser(description='Generate text embedding using Sentence-BERT model')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate text embeddings using Sentence-BERT')
     parser.add_argument('corpus_csv_file', type=argparse.FileType('r'), help='File containing the corpus')
-    parser.add_argument('data_type',choices=['recurso','tema'],help='Indicates whether the data type is recurso or tema')
-    parser.add_argument('column',help='Column that contains the text to be transformed into embedding')
-    parser.add_argument('model',default='distiluse-base-multilingual-cased-v1',nargs='?', help='The Sentence-BERT model used to generate embedding : Default = distiluse-base-multilingual-cased-v1')
-    parser.add_argument('--clean',action='store_true',help='Remove stopwords before creating embedding')
-    parser.add_argument('--begin_point',help='Word that marks the beginning of essential part of the text')
-    parser.add_argument('-v','--verbose',action='store_true',help='Increase the verbosity level')
+    parser.add_argument('data_type', choices=['recurso', 'tema'], help='Indicates whether the data type is "recurso" or "tema"')
+    parser.add_argument('column', help='Column with text to be converted into embeddings')
+    parser.add_argument('model', default='distiluse-base-multilingual-cased-v1', nargs='?', help='Sentence-BERT model used for embedding generation')
+    parser.add_argument('--clean', action='store_true', help='Remove stopwords before creating embeddings')
+    parser.add_argument('--begin_point', help='Keyword marking the start of the relevant text')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Increase verbosity')

     args = parser.parse_args()
     main(args)

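For reference, a hypothetical invocation given the arguments defined above (the file name, column name and begin_point value are placeholders):

python src/createEmbedding.py recursos.csv recurso texto --clean --begin_point ementa -v

With these options the script would write recursos_EMBEDDING_CLEAN.pkl (per create_file_name) and log_embedding_clean.txt with the timing summary.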

