From 713ec3094eacd2043ff61e24476bc0d6ed6a0c0c Mon Sep 17 00:00:00 2001
From: Ethan White
Date: Tue, 16 Apr 2024 22:40:07 -0400
Subject: [PATCH] Use requests instead of urllib.request.urlopen

The Python website started serving webpages that were gzipped, which
broke the more basic approach. requests handles this automatically and
is the widely accepted library for this type of work.

Fixes #106
---
 offlinedatasci/main.py | 15 ++++++++-------
 pyproject.toml         |  1 +
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/offlinedatasci/main.py b/offlinedatasci/main.py
index 0d0fcd3..7505f0f 100644
--- a/offlinedatasci/main.py
+++ b/offlinedatasci/main.py
@@ -11,6 +11,7 @@
 import urllib.request, urllib.error, urllib.parse
 import importlib_resources
 import pypi_mirror
+import requests
 import shutil
 import sys
 import warnings
@@ -189,8 +190,8 @@ def download_rstudio(ods_dir):
     destination_path = Path(Path(ods_dir), Path("rstudio"))
     if not os.path.isdir(destination_path):
         os.makedirs(destination_path)
-    fp = urllib.request.urlopen(baseurl)
-    web_content = fp.read()
+    fp = requests.get(baseurl)
+    web_content = fp.content
     soup = bs.BeautifulSoup(web_content, 'lxml')
     links = soup.find_all('a')
     for link in links:
@@ -214,8 +215,8 @@ def download_python(ods_dir):
     if not os.path.isdir(destination_path):
         os.makedirs(destination_path)
     python_versions = {}
-    fp = urllib.request.urlopen(url)
-    web_content = fp.read()
+    fp = requests.get(url)
+    web_content = fp.content
     soup = bs.BeautifulSoup(web_content, 'lxml')
     r_studio_download_table = soup.find_all('table')[download_table_num]
     table_body = r_studio_download_table.find('tbody')
@@ -239,7 +240,7 @@ def find_r_current_version(url):
     url -- CRAN r-project URL
     """
     version_regex = "(R\-\d+\.\d+\.\d)+\-(?:x86_64|arm64|win)\.(?:exe|pkg)"
-    urlfile = urllib.request.urlopen(url)
+    urlfile = requests.get(url).iter_lines()
     for line in urlfile:
         decoded = line.decode("utf-8")
         match = re.findall(version_regex, decoded)
@@ -297,8 +298,8 @@ def get_ods_dir(directory=Path.home()):
 def get_python_download_page():
     """Get download page from Python homepage."""
     base_url="https://www.python.org"
-    fp = urllib.request.urlopen(base_url)
-    web_content = fp.read()
+    fp = requests.get(base_url)
+    web_content = fp.content
     soup = bs.BeautifulSoup(web_content, "html.parser")
     release_a_tag = soup.find("a", href=lambda href: href and "release" in href)
     current_release_path = release_a_tag["href"]
diff --git a/pyproject.toml b/pyproject.toml
index f13f0c7..df9d3f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@
 dependencies = [
     'importlib_resources',
     'lxml',
     'python-pypi-mirror',
+    'requests',
     'setuptools'
 ]