# TweetCollectBySearch.py
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# To create CSV files
import csv
import pandas as pd # Not used by the collection loop itself; handy for post-processing the saved CSV
# To not spam the API
import time
# Variables to change on each use of the program:
keyword = "search example" # The query to search tweets for (required; cannot be empty)
collection_total = 500000 # How many tweets you want to collect with the API and add to the CSV (minimum of 100)
filename = "./tweets/return_1.csv" # Path of the CSV file to save the data into (the ./tweets folder must already exist)
os.environ['TOKEN'] = 'token' # Bearer Token from your Twitter developer account (replace 'token' with the real value)
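# Optional sketch (not part of the original flow): instead of hardcoding the token in source,
# it could be read from an environment variable set outside the script, e.g.
# `export TWITTER_BEARER_TOKEN=...` in your shell. The variable name below is illustrative.
# bearer = os.getenv('TWITTER_BEARER_TOKEN')
# if bearer is None:
#     raise RuntimeError("Set the TWITTER_BEARER_TOKEN environment variable first")
# os.environ['TOKEN'] = bearer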
# Functions of the program:
def auth():
    return os.getenv('TOKEN')

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
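# Example (illustrative): create_headers(auth()) returns a dict like
#   {"Authorization": "Bearer <your token>"}
# which is sent with every request made against the API below.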
def create_url(keyword, max_results=10):
    search_url = "https://api.twitter.com/2/tweets/search/recent" # Change to the endpoint you want to collect data from
    keyword = keyword + " lang:pt -is:retweet" # Remove retweets and restrict to Portuguese, regardless of query
    print("Searching for query: " + keyword)
    # Change params based on the endpoint you are using
    query_params = {'query': keyword,
                    # 'start_time': start_date,
                    # 'end_time': end_date,
                    'max_results': max_results,
                    # 'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'geo', # 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    # 'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    # 'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)
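# Example (illustrative): create_url("search example") searches for
#   "search example lang:pt -is:retweet"
# and returns the recent-search URL together with a params dict such as
#   {'query': 'search example lang:pt -is:retweet', 'max_results': 10,
#    'tweet.fields': 'geo', 'next_token': {}}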
def connect_to_endpoint(url, headers, params, next_token=None):
    params['next_token'] = next_token # params object received from create_url function
    response = requests.request("GET", url, headers=headers, params=params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
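
# Optional sketch (assumption, not in the original script): the recent-search endpoint is
# rate limited, and a 429 response currently raises an Exception above. A simple
# wait-and-retry wrapper could look like this; the function name and retry policy are
# illustrative choices.
def connect_to_endpoint_with_retry(url, headers, params, next_token=None, retries=3):
    for attempt in range(retries):
        params['next_token'] = next_token
        response = requests.request("GET", url, headers=headers, params=params)
        if response.status_code == 429: # Rate limited: back off and try again
            print("Rate limited, sleeping 60s (attempt {})".format(attempt + 1))
            time.sleep(60)
            continue
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        return response.json()
    raise Exception("Still rate limited after {} retries".format(retries))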
def append_to_csv(json_response, fileName):
    # A counter variable
    counter = 0
    # Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    # Loop through each tweet
    for tweet in json_response['data']:
        # We will create a variable for each field, since some of the keys might not exist
        # for some tweets, so we account for that
        # 1. Author ID
        # author_id = tweet['author_id']
        # 2. Time created
        # created_at = dateutil.parser.parse(tweet['created_at'])
        # 3. Geolocation
        if 'geo' in tweet:
            geo = tweet['geo']
        else:
            geo = " "
        # 4. Tweet ID
        tweet_id = tweet['id']
        # 5. Language
        # lang = tweet['lang']
        # 6. Tweet metrics
        # retweet_count = tweet['public_metrics']['retweet_count']
        # reply_count = tweet['public_metrics']['reply_count']
        # like_count = tweet['public_metrics']['like_count']
        # quote_count = tweet['public_metrics']['quote_count']
        # 7. Source
        # source = tweet['source']
        # 8. Tweet text
        text = tweet['text']
        # Assemble all data you have chosen to collect in a list
        res = [tweet_id, text, geo]
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1
    # When done, close the CSV file
    csvFile.close()
    # Print the number of tweets for this iteration
    print("# of Tweets added from this response into CSV: ", counter)
# Main program starts here:
max_results = 100 # Maximum allowed by the API, cannot be any larger (a page of up to 100 tweets per request)
total_tweets = 0 # Total number of tweets we collected from the loop
next_token = None # Token to paginate tweet responses (allows for a loop until total_tweets is met)
# Creating CSV file
csvFile = open(filename, "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)
# Create headers for the data you want to save into the CSV; in this example, we only want to save these columns in our dataset
csvWriter.writerow(['id','tweet','geo'])
csvFile.close()
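# Optional sketch (assumption): because the file is opened in append mode, re-running the
# script adds a second header row to an existing CSV. Writing the header only when the file
# does not exist yet would avoid that:
# if not os.path.isfile(filename):
#     with open(filename, "w", newline="", encoding='utf-8') as f:
#         csv.writer(f).writerow(['id', 'tweet', 'geo'])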
# Setup to collect tweets with v2 API
bearer_token = auth()
headers = create_headers(bearer_token)
# Collect until the given total is reached
while total_tweets < collection_total:
    print("-------------------")
    print("Token: ", next_token)
    # Request from the API
    url = create_url(keyword, max_results)
    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    result_count = json_response['meta']['result_count']
    # API request already done, now it's a matter of saving it into the CSV and setting up the next_token for the next loop
    if 'next_token' in json_response['meta']:
        # Save the token to use for the next call
        next_token = json_response['meta']['next_token']
        print("Next Token: ", next_token)
        if result_count is not None and result_count > 0 and next_token is not None:
            append_to_csv(json_response, filename)
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")
            time.sleep(1)
    # If no next token exists
    else:
        if result_count is not None and result_count > 0:
            print("-------------------")
            append_to_csv(json_response, filename)
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")
            time.sleep(1)
        next_token = None
        time.sleep(1)
        break # No more pages to fetch; stop instead of re-querying the same results indefinitely
print("TOTAL number of results: ", total_tweets)