-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3 LDA.py
90 lines (70 loc) · 2.7 KB
/
3 LDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
# 2023 Alcohol Industry Economic Operation Report.txt
import re
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gensim import corpora
from gensim.models.ldamodel import LdaModel
# Read the report from disk; each line of the file becomes one document
# (line endings are kept, exactly as readlines() would return them)
file_path = "2023 Alcohol Industry Economic Operation Report.txt"
with open(file_path, mode='r', encoding='utf-8') as report_file:
    documents = list(report_file)
def remove_keywords(text):
    """Return *text* with the marker words 'Abstract', 'Keywords' and 'SEP' removed.

    Only whole-word, case-sensitive occurrences are deleted; the surrounding
    whitespace and punctuation are left exactly as they were.
    """
    cleaned = text
    for marker in ('Abstract', 'Keywords', 'SEP'):
        # \b on both sides keeps longer words such as "Abstracts" intact
        whole_word = re.compile(r'\b' + marker + r'\b')
        cleaned = whole_word.sub('', cleaned)
    return cleaned
def preprocess_text(text):
    """Turn one raw document into a list of filtered tokens.

    The marker keywords are stripped with ``remove_keywords``, the result is
    tokenized by ``gensim.utils.simple_preprocess``, and a token is kept only
    if it is not in ``ENGLISH_STOP_WORDS`` and ``str.isalpha()`` is true for it.

    Returns:
        The surviving tokens, in their original order.
    """
    tokens = gensim.utils.simple_preprocess(remove_keywords(text))
    kept = []
    for token in tokens:
        if token in ENGLISH_STOP_WORDS:
            continue
        if not token.isalpha():
            continue
        kept.append(token)
    return kept
# Tokenize and filter every document
processed_docs = list(map(preprocess_text, documents))

# Build the token dictionary, then convert each document to bag-of-words form
dictionary = corpora.Dictionary(processed_docs)
corpus = list(map(dictionary.doc2bow, processed_docs))

# Number of topics the model is asked to find
num_topics = 7

# Fit the gensim LDA model with a fixed random_state
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=50,
    random_state=3154,
)

# Collect 10 words per topic as (topic_id, [(word, probability), ...]) pairs;
# nothing is printed here, the result feeds the heatmap below
topicWordProbMat = lda_model.show_topics(
    formatted=False,
    num_words=10,
    num_topics=num_topics,
)
# Column labels for the topic-word table: "Topic 1" ... "Topic <num_topics>"
columns = ['Topic ' + str(x) for x in range(1, num_topics + 1)]

# Maps each distinct word to its row index, in first-seen order
DC = {}

# Upper bound on the number of distinct words: every listed (word, prob) pair
# could introduce a new word.  Sizing the buffer from the data avoids the
# IndexError a fixed 100-row buffer hits once num_topics * num_words > 100.
max_rows = sum(len(words_probs) for _, words_probs in topicWordProbMat)
zz = np.zeros((max_rows, num_topics))

# Fill the matrix: zz[row, topic_id] is the word's probability in that topic.
# Cells for words that are not in a topic's word list stay at 0.
for topic_id, words_probs in topicWordProbMat:
    for word, prob in words_probs:
        # setdefault hands out the next free row the first time a word is seen
        row = DC.setdefault(word.strip(), len(DC))
        zz[row, topic_id] = prob

# Keep only the rows that were actually assigned to a word
zz = zz[:len(DC)].copy()

# Labelled view of the same data: one row per word, one column per topic
df = pd.DataFrame(zz, index=list(DC), columns=columns)
# Plot the heatmap: one row per word, one column per topic
plt.figure(figsize=(20, 10))
plt.imshow(zz, cmap='rainbow', interpolation='nearest')

# Write each word to the left of its row.  imshow centres row i on y == i,
# so the label is placed at y == row (a +0.5 offset would only be right for
# pcolor-style axes where cell i spans [i, i + 1]).
for row, word in enumerate(DC):
    plt.text(-2.5, row, word, horizontalalignment='center',
             verticalalignment='center')

# Set the title and hide the y-ticks, since the rows are labelled manually
plt.title("Heatmap of Topic-Word Probabilities")
plt.yticks([])

# Save the heatmap to a file before showing it
plt.savefig("heatmap_wine.png", transparent=True, dpi=400)

# Show the plot
plt.show()