forked from EwoutH/shipping-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmsc_automated.py
195 lines (155 loc) · 8.54 KB
/
msc_automated.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import itertools
import os
from datetime import date
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Origin port codes to query
o_ports = [
    'BRFOR', 'BRSSZ', 'BRPEC', 'BRSSA', 'BRPNG', 'BRRIO', 'BRRIG', 'BRITJ', 'BRNAT',
    'COCTG', 'COSMR', 'COTRB',
    'SRPBM',
    'CWWIL',
    'UYMVD',
    'ARBUE',
    'CLCNL', 'CLSAI', 'CLVAP',
    'PEPAI', 'PECLL',
    'ECGYE', 'ECPBO', 'ECPSJ',
    'VNVUT', 'VNCMT',
]

# Destination port codes to query
d_ports = [
    'NLRTM',
    'BEANR',
    'NLVLI',
    'BEZEE',
    'NLAMS',
]

# Every (origin, destination) pair, origins varying slowest
od_ports = [(origin, destination) for origin in o_ports for destination in d_ports]

# Base URL of the schedule search page
url = "https://www.msc.com/en/search-a-schedule"

# Today's date, printed once at start-up and used in the output file names
today = date.today()
print(today)

# Search criteria: only ask for direct routings
direct = True
def get_soups(od_ports=od_ports, headless=True):
    """Scrape the MSC schedule search page for every origin/destination pair.

    A single Chrome session is started. For each pair the search page is
    reloaded, both port fields are filled in, the "direct routing" box is
    ticked when the module-level ``direct`` flag is set, the search is run and
    every visible "more" button in the results is clicked before the page
    source is captured.

    Args:
        od_ports: Sized iterable of ``(origin, destination)`` port-code pairs.
        headless: When true, start Chrome with the headless-related arguments.

    Returns:
        A list of ``(BeautifulSoup, od)`` tuples, one for each pair whose
        result elements showed up. Pairs reported as having no results, or
        whose results did not appear in time, are skipped.

    Raises:
        selenium.common.exceptions.WebDriverException: If a step that is not
            explicitly tolerated fails (for example the search form never
            loads). The browser is shut down before the exception propagates.
    """
    n_od_ports = len(od_ports)

    # Instantiate options
    opts = Options()
    opts.binary_location = "/usr/bin/google-chrome"
    # Windows example: C:\Program Files\Google\Chrome\Application\chrome.exe
    if headless:
        # Pass the flag directly; the Options.headless setter was deprecated
        # and later removed in Selenium 4, where assigning it does nothing.
        opts.add_argument('--headless')
        opts.add_argument('--remote-debugging-port=9222')
        opts.add_argument('--disable-gpu')
        opts.add_argument("--window-size=2160,2160")

    # Some options that presumably make Chrome look less like an automated browser
    opts.add_argument('--disable-blink-features=AutomationControlled')
    opts.add_experimental_option('useAutomationExtension', False)
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])

    # Set the location of the webdriver (progress bar of the downloader disabled)
    os.environ["WDM_PROGRESS_BAR"] = '0'
    chrome_service = Service(ChromeDriverManager().install())

    # Instantiate a webdriver
    driver = webdriver.Chrome(options=opts, service=chrome_service)

    soups = []
    try:
        # Load the HTML page
        driver.get(url)

        # Accept cookies; the banner may be absent, which is not an error
        driver.implicitly_wait(3)
        try:
            driver.find_element(By.ID, "onetrust-accept-btn-handler").click()
        except WebDriverException:
            print("Cookies already accepted.")

        for n, od in enumerate(od_ports):
            o, d = od

            # Reload page
            driver.get(url)

            # Wait until page is loaded
            WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.ID, "from")))
            sleep(0.5)

            # TODO: Replace page reload with proper scroll up and clearing the fields.

            # Fill in and select the origin and destination port
            for label, port in [("from", o), ("to", d)]:
                elem = driver.find_element(By.ID, label)
                sleep(0.1)
                elem.send_keys(port)
                sleep(0.7)  # TODO make this a proper WebDriverWait
                # TAB then ENTER is sent after typing the port code
                ActionChains(driver).send_keys(Keys.TAB).send_keys(Keys.ENTER).perform()

            # Select direct routing
            if direct:
                driver.find_element(By.ID, "directrouting").click()
                sleep(0.1)

            # Click search button
            driver.find_element(By.CSS_SELECTOR, "button.msc-cta").click()
            sleep(0.1)

            # Check if there are any connections
            no_connections = driver.find_elements(By.XPATH, "//*[contains(text(), 'no results available')]")
            if len(no_connections) >= 1:
                print(f"NO CONNECTIONS available for route {o} to {d}. {n + 1}/{n_od_ports}")
                continue

            # Check if connections are present
            try:
                WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.CLASS_NAME, "msc-search-schedule-result-details__result")))
            except TimeoutException:
                print(f"NO DATA scraped for route {o} to {d}. {n+1}/{n_od_ports}")
                continue

            # Find all buttons to open; a timeout is reported but the page is still captured
            try:
                WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.msc-search-schedule-result-details__more-button")))
            except TimeoutException:
                print(f"NO BUTTONS founds for route {o} to {d}. {n + 1}/{n_od_ports}")
            sleep(0.05)
            buttons = driver.find_elements(By.CSS_SELECTOR, "button.msc-search-schedule-result-details__more-button")

            # Open the buttons that are displayed
            for btn in buttons:
                if btn.is_displayed():
                    btn.click()
                    sleep(0.05)
            sleep(0.1)

            # Create soup and append to soups list
            soups.append((BeautifulSoup(driver.page_source, features="html.parser"), od))
            print(f"SUCCESSFULLY scraped for route {o} to {d}. {n+1}/{n_od_ports}")
    finally:
        # Always shut the browser down, even when a route fails half-way
        driver.quit()

    return soups
def _span_text(connection, x_text, last=True):
    """Return the text of a ``<span>`` bound to *x_text* inside *connection*.

    Args:
        connection: Parsed element to search; only ``find_all`` is used.
        x_text: Exact value of the span's ``x-text`` attribute.
        last: Pick the last matching span when true, the first otherwise.

    Returns:
        The span's ``text``, or ``None`` when no span carries that binding.
    """
    spans = connection.find_all("span", {"x-text": x_text})
    if not spans:
        return None
    return (spans[-1] if last else spans[0]).text


# Returns connection data for input soups
def get_connection_data(soups):
    """Extract one row of schedule data per connection found in *soups*.

    Args:
        soups: Iterable of ``(soup, (origin, destination))`` pairs as produced
            by ``get_soups``.

    Returns:
        A list of 16-item lists ordered like the module-level ``columns``:
        origin, departure port/terminal, destination, arrival port/terminal,
        departure/arrival time, transit time, service name, CO2 footprint,
        vessel name, voyage number, vessel build year, flag and IMO code.
        A field whose span is missing from the page is ``None`` instead of
        aborting the whole extraction.
    """
    # Builders for the two families of x-text bindings used by the result page
    leg = "getLegSequenceValue(entry, '{}')".format
    vessel = "getLegSequenceValue(entry, 'Vessel').{}".format

    connection_data = []
    for soup, od in soups:
        o, d = od
        # Split each soup in its individual connections
        connections = soup.find_all("div", {"class": "msc-search-schedule-result-details__result"})
        for connection in connections:
            # The two entry-level values use the first matching span; every
            # leg/vessel value uses the last matching span.
            connection_data.append([
                o,
                _span_text(connection, leg("DeparturePortName")),
                _span_text(connection, leg("DepartureEquipmentHandlingFacilityName")),
                d,
                _span_text(connection, leg("ArrivalPortName")),
                _span_text(connection, leg("ArrivalEquipmentHandlingFacilityName")),
                _span_text(connection, leg("EstimatedDepartureTimeFormatted")),
                _span_text(connection, leg("EstimatedArrivalTimeFormatted")),
                _span_text(connection, "entry.TotalTransitTime", last=False),
                _span_text(connection, leg("MaritimeServiceName")),
                _span_text(connection, "entry.CO2FootPrint", last=False),
                _span_text(connection, vessel("VesselName")),
                _span_text(connection, leg("DepartureVoyageNo")),
                _span_text(connection, vessel("VesselBuiltYear")),
                _span_text(connection, vessel("VesselFlag")),
                _span_text(connection, vessel("VesselImoCode")),
            ])
    return connection_data
# Define dict keys / dataframe columns (same order as the rows built by get_connection_data)
columns = [
    "Origin", "Departure port", "Departure terminal",
    "Destination", "Arrival port", "Arrival terminal",
    "Departure time", "Arrival time", "Transit time",
    "Service name", "CO2 footprint",
    "Vessel name", "Voyage no.", "Vessel build year", "Vessel flag", "Vessel imo code",
]

# Scrape the webpage
soups_ods = get_soups(od_ports, headless=True)

# Extract one row per connection from the scraped pages
connection_data = get_connection_data(soups_ods)

# Create a dataframe from all the route data
connection_df = pd.DataFrame(connection_data, columns=columns)
print(f"Done. DataFrame has {connection_df.index.size} entries")

# Save as Pickle and CSV. The target folders are created first so that a
# missing directory cannot throw away a completed scrape.
pickle_path = f"../pickles/msc_daily/connections_{today}.pickle"
csv_path = f"../data/msc_daily/connections_{today}.csv"
for out_path in (pickle_path, csv_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
connection_df.to_pickle(pickle_path)
connection_df.to_csv(csv_path)