Skip to content

Commit

Permalink
Update createTopics.py
Browse files Browse the repository at this point in the history
  • Loading branch information
gregoriofsg authored Dec 13, 2023
1 parent 08f755d commit bde0bbb
Showing 1 changed file with 27 additions and 20 deletions.
47 changes: 27 additions & 20 deletions createTopics.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import argparse
import time
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, util
import numpy as np
from bertopic import BERTopic
from abc import ABC, abstractmethod
import pickle
import numpy as np
import pandas as pd
import csv
import sys
import nltk
from LexRank import degree_centrality_scores


# Implementing the Strategy design pattern
Expand All @@ -18,14 +20,15 @@ class Estrategia(ABC):
def executar(self, corpus_embedding, model, seed_list=None,verbose=None):
pass

@classmethod
def createFileName(cls, stg, fen, sz):
    """Build the pickle file name used to store the generated topics.

    Args:
        stg: Strategy tag placed right after the ``TOPICS_`` prefix
            (callers in this file pass ``'B'``, ``'G'`` or ``'L'``).
        fen: Name of the embedding file the topics were derived from. If it
            contains the (case-sensitive) substring ``'clean'``, the result
            gets a ``CLEAN`` suffix before the extension.
        sz: Size value embedded in the name. It is interpolated as-is, so
            both ``int`` and ``str`` are accepted.

    Returns:
        ``'TOPICS_<stg><sz>CLEAN.pkl'`` or ``'TOPICS_<stg><sz>.pkl'``.
    """
    # An f-string is used instead of ``+`` concatenation because the size is
    # passed in as an int, which would raise TypeError when added to a str.
    suffix = 'CLEAN' if 'clean' in fen else ''
    return f'TOPICS_{stg}{sz}{suffix}.pkl'

def saveTopicFile(self, strategy, size, fileEmbeddingName, indice, numTema, topics):
    """Pickle the generated topics to a file named after the strategy.

    The target file name comes from ``self.createFileName(strategy,
    fileEmbeddingName, size)``; the file is opened in binary write mode, so
    an existing file with the same name is overwritten.

    Args:
        strategy: Strategy tag forwarded to ``createFileName``.
        size: Size value forwarded to ``createFileName``.
        fileEmbeddingName: Embedding file name forwarded to
            ``createFileName``.
        indice: Stored under the ``'indice'`` key.
        numTema: Stored under the ``'numTema'`` key.
        topics: Stored under the ``'topics'`` key.
    """
    # Resolve the name through the instance: ``createFileName`` is a method,
    # not a module-level function, so an unqualified call would raise
    # NameError.
    name = self.createFileName(strategy, fileEmbeddingName, size)
    with open(name, "wb") as fOut:
        pickle.dump(
            {'indice': indice, 'topics': topics, 'numTema': numTema},
            fOut,
            protocol=pickle.HIGHEST_PROTOCOL,
        )

Expand All @@ -43,7 +46,7 @@ def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):
topic_model = BERTopic(embedding_model=model,top_n_words=size,verbose=verbose)
topics, probs = topic_model.fit_transform(stored_sentences,stored_embeddings)
representacao = topic_model.get_document_info(stored_sentences)
saveTopicFile('B',size,corpus_embedding,stored_indice,stored_number,representacao['Top_n_words'])
self.saveTopicFile('B',size,corpus_embedding,stored_indice,stored_number,representacao['Top_n_words'])

class EstrategiaBertopicGuided(Estrategia):
def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):
Expand All @@ -57,7 +60,7 @@ def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):
topic_model = BERTopic(embedding_model=model,top_n_words=size,verbose=verbose,seed_topic_list=seed_list)
topics, probs = topic_model.fit_transform(stored_sentences,stored_embeddings)
representacao = topic_model.get_document_info(stored_sentences)
saveTopicFile('G',size,corpus_embedding,stored_indice,stored_number,representacao['Top_n_words'])
self.saveTopicFile('G',size,corpus_embedding,stored_indice,stored_number,representacao['Top_n_words'])

class EstrategiaLexrank(Estrategia):
def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):
Expand All @@ -74,16 +77,14 @@ def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):
stored_embeddings = stored_data['embeddings']
stored_number = stored_data['numTema']
print("Executando Estratégia Lexrank")
#topic_model = BERTopic(embedding_model=model,top_n_words=size,verbose=verbose)
#topics, probs = topic_model.fit_transform(stored_sentences,stored_embeddings)
#representacao = topic_model.get_document_info(stored_sentences)


for text,embeddings in zip(stored_sentences,stored_embeddings):
topics = []
summary = ""

#Split the document into sentences
sentences = nltk.sent_tokenize(text)

#print("Num sentences:", len(sentences))

#Compute the sentence embeddings
Expand All @@ -100,20 +101,26 @@ def executar(self, corpus_embedding,size, model, seed_list=None,verbose=None):

#Collect the `size` sentences with the highest scores into the summary
#print("\n\nSummary:")
for idx in most_central_sentence_indices[0:10]:
topics.append(text[idx].strip())
for idx in most_central_sentence_indices[0:size]:
topics.append(sentences[idx].strip())
summary = "".join(topics)
representacao.append(summary)
sys.stdout.write("\r ") # \r faz o cursor retroceder ao início da linha
sys.stdout.write(f' Percentual concluído: {len(representacao)/len(stored_indice)*100:.2f}%')
sys.stdout.flush() # Força a impressão imediata
print(" ", end='\r')

#print(representacao[1])

saveTopicFile('L',size,corpus_embedding,stored_indice,stored_number,representacao)
self.saveTopicFile('L',size,corpus_embedding,stored_indice,stored_number,representacao)

# Class that uses a strategy
class Contexto:
    """Context of the Strategy pattern: holds a strategy and delegates to it."""

    def __init__(self, estrategia):
        """Store the strategy object whose ``executar`` will be invoked."""
        self.estrategia = estrategia

    def executar_estrategia(self, corpus_embedding, size, model, seed_list=None, verbose=None):
        """Run the configured strategy, forwarding every argument unchanged.

        Args:
            corpus_embedding: Forwarded as the strategy's first argument.
            size: Forwarded as the strategy's second argument.
            model: Forwarded as the strategy's third argument.
            seed_list: Optional; forwarded positionally (default ``None``).
            verbose: Optional; forwarded positionally (default ``None``).
        """
        # Only this signature is kept: an earlier duplicate definition without
        # ``size`` was shadowed and passed ``model`` where strategies expect
        # ``size``.
        self.estrategia.executar(corpus_embedding, size, model, seed_list, verbose)

def main(args):
print("############### PROGRAMA DE GERAÇÃO DE TÓPICOS ###############")
Expand All @@ -135,7 +142,7 @@ def main(args):
return

contexto = Contexto(estrategia)
contexto.executar_estrategia(args.corpus_embedding,args.size, args.model, args.seed_list,verbose)
contexto.executar_estrategia(args.corpus_embedding,int(args.size), args.model, args.seed_list,verbose)

print("Salvando log ...")
tempo_fim = time.time()
Expand Down

0 comments on commit bde0bbb

Please sign in to comment.