From f43baa32688fb6bd90a519e1578d9a3a0a6bfdad Mon Sep 17 00:00:00 2001
From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com>
Date: Thu, 15 Jun 2023 09:56:19 +0200
Subject: [PATCH] fix temporary directory cleanup and add documentation

---
 CHANGELOG.md                    |   5 +
 license_tools/scancode_tools.py | 195 ++++++++++++++++++++++++++++++--
 setup.py                        |   2 +-
 3 files changed, 193 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 796c8ce..3d86496 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Development version
 
+# Version 0.2.0 - 2023-06-15
+
+* Make sure to delete the temporary run-specific directory on exit.
+* Add some code documentation.
+
 # Version 0.1.3 - 2023-06-14
 
 * Avoid running shared object linking analysis twice.
diff --git a/license_tools/scancode_tools.py b/license_tools/scancode_tools.py
index 1eaccca..6464ad7 100644
--- a/license_tools/scancode_tools.py
+++ b/license_tools/scancode_tools.py
@@ -2,8 +2,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
 
+"""
+Convenience interface for the ScanCode toolkit project using some predefined
+configuration and returning `dataclass` instances instead of dictionaries.
+"""
+
 from __future__ import annotations
 
+import atexit
 import datetime
 import shutil
 import subprocess
@@ -14,6 +20,8 @@
 from tempfile import TemporaryDirectory
 from typing import Generator
 
+import scancode_config
+from commoncode import fileutils
 from joblib import Parallel, delayed
 from scancode import api
 
@@ -23,6 +31,10 @@
 
 @dataclass
 class Author:
+    """
+    Matching information about an author.
+    """
+
     author: str
     start_line: int
     end_line: int
@@ -30,6 +42,10 @@ class Author:
 
 @dataclass
 class Holder:
+    """
+    Matching information about a copyright holder.
+    """
+
     holder: str
     start_line: int
     end_line: int
@@ -37,6 +53,10 @@ class Holder:
 
 @dataclass
 class Copyright:
+    """
+    Matching information about copyrights.
+    """
+
     copyright: str
     start_line: int
     end_line: int
@@ -44,6 +64,10 @@ class Copyright:
 
 @dataclass
 class Copyrights:
+    """
+    Copyright-specific results.
+    """
+
     copyrights: list[Copyright] = dataclass_field(default_factory=list)
     holders: list[Holder] = dataclass_field(default_factory=list)
     authors: list[Author] = dataclass_field(default_factory=list)
@@ -56,6 +80,10 @@ def __post_init__(self):
 
 @dataclass
 class Email:
+    """
+    Matching information about an e-mail.
+    """
+
     email: str
     start_line: int
     end_line: int
@@ -63,6 +91,10 @@ class Email:
 
 @dataclass
 class Emails:
+    """
+    E-mail-specific results.
+    """
+
     emails: list[Email] = dataclass_field(default_factory=list)
 
     def __post_init__(self):
@@ -71,6 +103,10 @@
 
 @dataclass
 class Url:
+    """
+    Matching information about a URL.
+    """
+
     url: str
     start_line: int
     end_line: int
@@ -78,6 +114,10 @@
 
 @dataclass
 class Urls:
+    """
+    URL-specific results.
+    """
+
     urls: list[Url] = dataclass_field(default_factory=list)
 
     def __post_init__(self):
@@ -86,6 +126,10 @@
 
 @dataclass
 class FileInfo:
+    """
+    File-specific results.
+    """
+
     date: datetime.date
     size: int
     sha1: str
@@ -107,6 +151,10 @@ def __post_init__(self):
 
 @dataclass
 class LicenseMatch:
+    """
+    Matching information about a license.
+    """
+
     score: float
     start_line: int
     end_line: int
@@ -121,6 +169,10 @@ class LicenseMatch:
 
 @dataclass
 class LicenseDetection:
+    """
+    Information on a specific detected license.
+ """ + license_expression: str identifier: str matches: list[LicenseMatch] = dataclass_field(default_factory=list) @@ -131,6 +183,10 @@ def __post_init__(self): @dataclass class Licenses: + """ + Information on all detected licenses. + """ + detected_license_expression: str detected_license_expression_spdx: str percentage_of_license_text: float @@ -153,13 +209,22 @@ def get_scores_of_detected_license_expression_spdx(self): @dataclass class FileResults: + """ + Container for all available file-level results. + """ + + # Reference to the analyzed file. path: Path short_path: str + + # Configuration values to determine which information to retrieve. retrieve_copyrights: bool = False retrieve_emails: bool = False retrieve_urls: bool = False retrieve_licenses: bool = False retrieve_file_info: bool = False + + # Analysis results. copyrights: Copyrights = NOT_REQUESTED emails: Emails = NOT_REQUESTED urls: Urls = NOT_REQUESTED @@ -180,7 +245,13 @@ def __post_init__(self): self.file_info = FileInfo(**api.get_file_info(path_str)) -def check_shared_objects(path: Path, short_path: str) -> str: +def check_shared_objects(path: Path) -> str | None: + """ + Check which other shared objects a shared object links to. + + :param path: The file path to analyze. + :return: The analysis results if the path points to a shared object, `None` otherwise. + """ if path.suffix != '.so' and not (path.suffixes and path.suffixes[0] == '.so'): return output = subprocess.check_output(['ldd', path], stderr=subprocess.PIPE) @@ -196,10 +267,27 @@ def run_on_file( retrieve_urls: bool = False, retrieve_ldd_data: bool = False, ) -> FileResults: + """ + Run the analysis on the given file. + + :param path: The file path to analyze. + :param short_path: The short path to use for display. + :param retrieve_copyrights: Whether to retrieve copyright information. + :param retrieve_emails: Whether to retrieve e-mails. + :param retrieve_file_info: Whether to retrieve file-specific information. + :param retrieve_urls: Whether to retrieve URLs. + :param retrieve_ldd_data: Whether to retrieve linking data for shared objects. + :return: The requested results. + """ + # This data is not yet part of the dataclasses above, as it is a custom analysis. if retrieve_ldd_data: - results = check_shared_objects(path=path, short_path=short_path) + results = check_shared_objects(path=path) if results: print(short_path + '\n' + results) + + # Register this here as each parallel process has its own directory. + atexit.register(cleanup, scancode_config.scancode_temp_dir) + return FileResults( path=path, short_path=short_path, @@ -220,6 +308,18 @@ def run_on_directory( retrieve_urls: bool = False, retrieve_ldd_data: bool = False, ) -> Generator[FileResults, None, None]: + """ + Run the analysis on the given directory. + + :param path: The directory to analyze. + :param job_count: The number of parallel jobs to use. + :param retrieve_copyrights: Whether to retrieve copyright information. + :param retrieve_emails: Whether to retrieve e-mails. + :param retrieve_file_info: Whether to retrieve file-specific information. + :param retrieve_urls: Whether to retrieve URLs. + :param retrieve_ldd_data: Whether to retrieve linking data for shared objects. + :return: The requested results per file. 
+ """ common_prefix_length = len(directory) + int(not directory.endswith("/")) def get_paths() -> tuple[Path, str]: @@ -245,8 +345,25 @@ def get_paths() -> tuple[Path, str]: def run_on_package_archive_file( archive_path: Path, - **kwargs + job_count: int = 4, + retrieve_copyrights: bool = False, + retrieve_emails: bool = False, + retrieve_file_info: bool = False, + retrieve_urls: bool = False, + retrieve_ldd_data: bool = False, ) -> Generator[FileResults, None, None]: + """ + Run the analysis on the given package archive file. + + :param path: The package archive path to analyze. + :param job_count: The number of parallel jobs to use. + :param retrieve_copyrights: Whether to retrieve copyright information. + :param retrieve_emails: Whether to retrieve e-mails. + :param retrieve_file_info: Whether to retrieve file-specific information. + :param retrieve_urls: Whether to retrieve URLs. + :param retrieve_ldd_data: Whether to retrieve linking data for shared objects. + :return: The requested results. + """ with TemporaryDirectory() as working_directory: if archive_path.suffix == ".whl": # `shutil.unpack_archive` cannot handle wheel files. @@ -256,15 +373,37 @@ def run_on_package_archive_file( shutil.unpack_archive(archive_path, working_directory) yield from run_on_directory( directory=working_directory, - **kwargs + job_count=job_count, + retrieve_copyrights=retrieve_copyrights, + retrieve_emails=retrieve_emails, + retrieve_file_info=retrieve_file_info, + retrieve_urls=retrieve_urls, ) def run_on_downloaded_package_file( package_definition: str, index_url: str | None = None, - **kwargs + job_count: int = 4, + retrieve_copyrights: bool = False, + retrieve_emails: bool = False, + retrieve_file_info: bool = False, + retrieve_urls: bool = False, + retrieve_ldd_data: bool = False, ) -> Generator[FileResults, None, None]: + """ + Run the analysis for the given package definition. + + :param package_definition: The package definition to get the files for. + :param index_url: The PyPI index URL to use. Uses the default one from the `.pypirc` file if unset. + :param job_count: The number of parallel jobs to use. + :param retrieve_copyrights: Whether to retrieve copyright information. + :param retrieve_emails: Whether to retrieve e-mails. + :param retrieve_file_info: Whether to retrieve file-specific information. + :param retrieve_urls: Whether to retrieve URLs. + :param retrieve_ldd_data: Whether to retrieve linking data for shared objects. + :return: The requested results. + """ with TemporaryDirectory() as download_directory: command = [ "pip", @@ -280,15 +419,29 @@ def run_on_downloaded_package_file( name = list(Path(download_directory).glob("*"))[0] yield from run_on_package_archive_file( archive_path=name.resolve(), - **kwargs + job_count=job_count, + retrieve_copyrights=retrieve_copyrights, + retrieve_emails=retrieve_emails, + retrieve_file_info=retrieve_file_info, + retrieve_urls=retrieve_urls, ) -def _check_only_one_value_set(values): +def _check_that_exactly_one_value_is_set(values: list[Path | str | None]) -> bool: + """ + Check that exactly one value does not evaluate to `False`. + """ filtered = list(filter(None, values)) return len(filtered) == 1 +def cleanup(directory: Path | str) -> None: + """ + Remove the given directory. 
+ """ + fileutils.delete(directory) + + def run( directory: Path | str | None = None, file_path: Path | str | None = None, @@ -301,7 +454,28 @@ def run( retrieve_urls: bool = False, retrieve_ldd_data: bool = False, ) -> FileResults: - assert _check_only_one_value_set([directory, file_path, package_definition]), 'Exactly one source is required.' + """ + Run the analysis for the given input definition. + + The `directory`, `file_path` and `package_definition` parameters are mutually exclusive, + but exactly one has to be set. + + :param directory: The directory to run on. + :param file_path: The file to run on. + :param package_definition: The package definition to run for. + :param index_url: The PyPI index URL to use. Uses the default one from the `.pypirc` file if unset. + :param job_count: The number of parallel jobs to use. + :param retrieve_copyrights: Whether to retrieve copyright information. + :param retrieve_emails: Whether to retrieve e-mails. + :param retrieve_file_info: Whether to retrieve file-specific information. + :param retrieve_urls: Whether to retrieve URLs. + :param retrieve_ldd_data: Whether to retrieve linking data for shared objects. + :return: The requested results. + """ + # Remove the temporary directory of the main thread. + atexit.register(cleanup, scancode_config.scancode_temp_dir) + + assert _check_that_exactly_one_value_is_set([directory, file_path, package_definition]), 'Exactly one source is required.' license_counts = defaultdict(int) kwargs = dict( @@ -313,6 +487,7 @@ def run( job_count=job_count, ) + # Run the analysis itself. if package_definition: results = list( run_on_downloaded_package_file( @@ -338,6 +513,7 @@ def run( ) ] + # Display the file-level results. for result in results: scores = result.licenses.get_scores_of_detected_license_expression_spdx() print( @@ -349,6 +525,7 @@ def run( ) license_counts[result.licenses.detected_license_expression_spdx] += 1 + # Display the license-level results. print() print("=" * 130) print() diff --git a/setup.py b/setup.py index 68bb388..56bc9d6 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup( name='license_tools', description='Collection of tools for working with Open Source licenses', - version='0.1.3', + version='0.2.0', license='Apache-2.0', long_description=Path(ROOT_DIRECTORY / 'README.md').read_text(encoding='UTF-8'), long_description_content_type='text/markdown',