diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 7fd4891..6e2c6fc 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -10,8 +10,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest]
-        python-version: ['3.8', '3.9', '3.10']
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ['3.8', '3.9', '3.10', '3.11']

     steps:
       - name: Checkout repository
@@ -35,7 +35,7 @@ jobs:
       - name: Download resources
        run: |
          python3 -m pip install .
-         python3 -m perke download ${{ secrets.GITHUB_TOKEN }}
+         python3 -m perke download

      - name: Run tests
        run: pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a7914cf..cc9811a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
          - --profile=black
          - --line-length=79
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
    hooks:
      - id: black
        args:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf9ba3e..5741f3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,13 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## [Unreleased]
+### Added
+- Added support for Python `3.11`
+- Brought Windows tests back
+
+### Changed
+- Adapted Perke to all changes in the
+  [Hazm](https://github.com/roshan-research/hazm) `0.9` release

 ## [0.4.1] - 2023-03-15
 ### Fixed
@@ -32,7 +39,8 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - Changed CI from Travis CI to GitHub workflows

 ### Removed
-- Removed Windows tests since hazm runs on WSL and WSL tests is same as Linux
+- Removed Windows tests since [Hazm](https://github.com/roshan-research/hazm)
+  runs on WSL and WSL tests are the same as Linux tests

 ### Fixed
 - Removed type hints from docstrings
diff --git a/README.md b/README.md
index 9e39c16..c7b1922 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![pre-commit.ci](https://results.pre-commit.ci/badge/github/AlirezaTheH/perke/main.svg)](https://results.pre-commit.ci/latest/github/alirezatheh/perke/main)
 [![PyPI Version](https://img.shields.io/pypi/v/perke)](https://pypi.python.org/pypi/perke)
 [![Python Versions](https://img.shields.io/pypi/pyversions/perke)](https://pypi.org/project/perke)
-[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/latest/?badge=stable)
+[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/stable/?badge=stable)

 Perke is a Python keyphrase extraction package for Persian language. It
 provides an end-to-end keyphrase extraction pipeline in which each component
diff --git a/examples/unsupervised/graph_based/multipartite_rank.py b/examples/unsupervised/graph_based/multipartite_rank.py
index d117d57..552d250 100644
--- a/examples/unsupervised/graph_based/multipartite_rank.py
+++ b/examples/unsupervised/graph_based/multipartite_rank.py
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import MultipartiteRank

 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}

 # 1. Create a MultipartiteRank extractor.
 extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)
diff --git a/examples/unsupervised/graph_based/position_rank.py b/examples/unsupervised/graph_based/position_rank.py
index 9e6ef41..b6dee76 100644
--- a/examples/unsupervised/graph_based/position_rank.py
+++ b/examples/unsupervised/graph_based/position_rank.py
@@ -3,15 +3,15 @@
 from perke.unsupervised.graph_based import PositionRank

 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}

 # Define the grammar for selecting the keyphrase candidates
 grammar = r"""
 NP:
-    <P>{<N>}<V>
+    <ADP>{<NOUN>}<VERB>
 NP:
-    {<Ne|AJe>*<N|AJ>}
-    <N>}{<.*e?>
+    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
+    <NOUN>}{<.*(,EZ)?>
 """

 # 1. Create a PositionRank extractor.
@@ -19,7 +19,11 @@
 # 2. Load the text.
 input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
-extractor.load_text(input=input_filepath, word_normalization_method=None)
+extractor.load_text(
+    input=input_filepath,
+    word_normalization_method=None,
+    universal_pos_tags=False,
+)

 # 3. Select the noun phrases up to 3 words as keyphrase candidates.
 extractor.select_candidates(grammar=grammar, maximum_word_number=3)
diff --git a/examples/unsupervised/graph_based/single_rank.py b/examples/unsupervised/graph_based/single_rank.py
index 2a94a96..569d78f 100644
--- a/examples/unsupervised/graph_based/single_rank.py
+++ b/examples/unsupervised/graph_based/single_rank.py
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import SingleRank

 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}

 # 1. Create a SingleRank extractor.
 extractor = SingleRank(valid_pos_tags=valid_pos_tags)
diff --git a/examples/unsupervised/graph_based/text_rank.py b/examples/unsupervised/graph_based/text_rank.py
index 69a6603..78a8276 100644
--- a/examples/unsupervised/graph_based/text_rank.py
+++ b/examples/unsupervised/graph_based/text_rank.py
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import TextRank

 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}

 # 1. Create a TextRank extractor.
 extractor = TextRank(valid_pos_tags=valid_pos_tags)
diff --git a/examples/unsupervised/graph_based/topic_rank.py b/examples/unsupervised/graph_based/topic_rank.py
index ba619aa..b10bab9 100644
--- a/examples/unsupervised/graph_based/topic_rank.py
+++ b/examples/unsupervised/graph_based/topic_rank.py
@@ -1,15 +1,15 @@
-from os.path import dirname, join
+from pathlib import Path

 from perke.unsupervised.graph_based import TopicRank

 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}

 # 1. Create a TopicRank extractor.
 extractor = TopicRank(valid_pos_tags=valid_pos_tags)

 # 2. Load the text.
-input_filepath = join(dirname(dirname(dirname(__file__))), 'input.txt')
+input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
 extractor.load_text(input=input_filepath, word_normalization_method='stemming')

 # 3. Select the longest sequences of nouns and adjectives, that do
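For reference, all five updated examples follow the same pipeline. A minimal sketch of that flow with the new tag set (the input path and the printing loop are illustrative, not part of the shipped examples, which resolve the shared `input.txt` relative to their own location):

```python
from pathlib import Path

from perke.unsupervised.graph_based import MultipartiteRank

# With hazm 0.9, universal POS tags are the default, so nouns and
# adjectives are now tagged 'NOUN' and 'ADJ'.
extractor = MultipartiteRank(valid_pos_tags={'NOUN', 'ADJ'})

# Illustrative path; point this at any Persian text file.
extractor.load_text(
    input=Path('input.txt'), word_normalization_method='stemming'
)

extractor.select_candidates()
extractor.weight_candidates()
for keyphrase, weight in extractor.get_n_best(n=3):
    print(keyphrase, weight)
```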
diff --git a/perke/base/extractor.py b/perke/base/extractor.py
index aea3b60..9e3e724 100644
--- a/perke/base/extractor.py
+++ b/perke/base/extractor.py
@@ -45,7 +45,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         self.word_normalization_method: Optional[str] = None
         self.sentences: List[Sentence] = []
@@ -54,13 +54,14 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
             punctuation_marks
         )
         if valid_pos_tags is None:
-            valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+            valid_pos_tags = {'NOUN', 'ADJ'}
         self.valid_pos_tags: Set[str] = valid_pos_tags

     def load_text(
         self,
         input: Union[str, Path],
         word_normalization_method: WordNormalizationMethod = 'stemming',
+        universal_pos_tags: bool = True,
     ) -> None:
         """
         Loads the text of a document or string.
@@ -74,9 +75,15 @@ def load_text(
             Word normalization method, defaults to `'stemming'`. See
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not,
+            defaults to `True`.
         """
         # Initialize reader
-        reader = RawTextReader(input, word_normalization_method)
+        reader = RawTextReader(
+            input, word_normalization_method, universal_pos_tags
+        )

         # Load sentences
         self.sentences = reader.read()
@@ -225,7 +232,7 @@ def _add_candidate_occurrence(
             The offset of the occurrence

         normalized_words:
-            List of normalized of words of the occurrence
+            List of normalized words of the occurrence
         """
         # Build the canonical form of the candidate
         canonical_form = ' '.join(normalized_words)
@@ -306,7 +313,7 @@ def _select_candidates_with_longest_sequences(
             first = sequence_offsets[0]
             last = sequence_offsets[-1]

-            # Add the ngram as a new candidate occurrence
+            # Add the n-gram as a new candidate occurrence
             self._add_candidate_occurrence(
                 words=sentence.words[first : last + 1],
                 offset=offset_shift + first,
@@ -336,20 +343,20 @@ def _select_candidates_with_grammar(
             defaults to::
                 r\"""
                 NP:
-                    <P>{<N>}<V>
+                    <ADP>{<NOUN>}<VERB>
                 NP:
-                    {<Ne|AJe>*<N|AJ>}
-                    <N>}{<.*e?>'
+                    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
+                    <NOUN>}{<.*(,EZ)?>
                 \"""
         """
         # Initialize default grammar if none provided
         if grammar is None:
             grammar = r"""
                 NP:
-                    <P>{<N>}<V>
+                    <ADP>{<NOUN>}<VERB>
                 NP:
-                    {<Ne|AJe>*<N|AJ>}
-                    <N>}{<.*e?>
+                    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
+                    <NOUN>}{<.*(,EZ)?>
             """

         # Initialize parser
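A note on the grammar syntax: in nltk's `RegexpParser` notation, `{...}` chunks a matching tag sequence into an NP and `<left>}{<right>` splits a chunk between the two patterns, so the `e` / `,EZ` suffixes above track hazm's old and new ezafe markers. A self-contained sketch with hand-written hazm-0.9-style tags (the sentence, the tags, and the simplified rule are for illustration only, not real tagger output):

```python
import nltk

# A simplified rule in the spirit of the default grammar: zero or more
# ezafe-marked nouns/adjectives followed by a plain noun or adjective.
grammar = r"""
NP:
    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
"""
parser = nltk.RegexpParser(grammar)

# Hand-tagged tokens for 'پردازش زبان طبیعی' (natural language processing).
tagged = [('پردازش', 'NOUN,EZ'), ('زبان', 'NOUN,EZ'), ('طبیعی', 'ADJ')]
print(parser.parse(tagged))
# Expected shape: (S (NP پردازش/NOUN,EZ زبان/NOUN,EZ طبیعی/ADJ))
```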

diff --git a/perke/base/readers.py b/perke/base/readers.py
index 6e279e8..29eb08a 100644
--- a/perke/base/readers.py
+++ b/perke/base/readers.py
@@ -1,4 +1,3 @@
-from os.path import dirname, join
 from pathlib import Path
 from typing import List

@@ -31,7 +30,9 @@ class Reader:
     """

     def __init__(
-        self, word_normalization_method: WordNormalizationMethod
+        self,
+        word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags: bool,
     ) -> None:
         """
         Initializes the reader.
@@ -42,6 +43,9 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
         self.word_normalization_method: WordNormalizationMethod = (
             word_normalization_method
@@ -49,10 +53,12 @@ def __init__(
         self.normalizer: hazm.Normalizer = hazm.Normalizer()
         self.stemmer: hazm.Stemmer = hazm.Stemmer()
         self.lemmatizer: hazm.Lemmatizer = hazm.Lemmatizer()
-        model_filepath = join(
-            dirname(dirname(__file__)), 'resources', 'postagger.model'
+        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(
+            model=str(
+                Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+            ),
+            universal_tag=universal_pos_tags,
         )
-        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(model=model_filepath)


 class RawTextReader(Reader):
@@ -69,6 +75,7 @@ def __init__(
         self,
         input: str,
         word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags: bool,
     ) -> None:
         """
         Initializes the reader.
@@ -82,8 +89,11 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
-        super().__init__(word_normalization_method)
+        super().__init__(word_normalization_method, universal_pos_tags)

         # If input is a filepath
         if isinstance(input, Path):
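The `universal_tag` flag passed to `hazm.POSTagger` is what `universal_pos_tags` ultimately controls: hazm 0.9's tagger can emit either its native tags, which keep the `,EZ` ezafe marker, or tags mapped to the universal set. A rough sketch of the difference (the model path is illustrative, and the tags in the comments are the expected shape, not captured output):

```python
import hazm

tokens = hazm.word_tokenize('پردازش زبان طبیعی')

# Native hazm 0.9 tags keep the ezafe marker, e.g. ('پردازش', 'NOUN,EZ').
native = hazm.POSTagger(model='pos_tagger.model', universal_tag=False)
print(native.tag(tokens))

# Universal tags collapse the ',EZ' variants, e.g. ('پردازش', 'NOUN').
universal = hazm.POSTagger(model='pos_tagger.model', universal_tag=True)
print(universal.tag(tokens))
```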
diff --git a/perke/cli/download.py b/perke/cli/download.py
index 242d597..99e10b7 100644
--- a/perke/cli/download.py
+++ b/perke/cli/download.py
@@ -1,91 +1,29 @@
-from io import BytesIO
 from pathlib import Path
-from typing import Optional
-from zipfile import ZipFile

-import requests
-import rich_click.typer as typer
-from github import Github
-from github.GitReleaseAsset import GitReleaseAsset
+import gdown

 from perke.cli.base import app


 @app.command('download')
-def download_command(
-    github_token: Optional[str] = typer.Argument(
-        None,
-        help=(
-            'The GitHub token to use with GitHub API in order to avoid rate'
-            'limit'
-        ),
-    ),
-) -> None:
+def download_command() -> None:
     """
     Perke requires a trained POS tagger model. We use hazm's tagger
     model. This command aims to easily download latest hazm's resources
     (tagger and parser models).
     """
-    download(github_token)
+    download()


-def download(github_token: Optional[str] = None) -> None:
+def download() -> None:
     """
     Function version of `download_command` to be available in the
     package.
     """
-    asset = get_latest_resources_asset(github_token)
-    extract_path = Path(__file__).parent.parent / 'resources'
-    download_and_extract_asset(asset, extract_path)
-
-
-def get_latest_resources_asset(github_token: str) -> GitReleaseAsset:
-    """
-    Searches through hazm's releases and find the latest release that
-    contains resources.
-
-    Parameters
-    ----------
-    github_token:
-        The GitHub token to use with GitHub API in order to avoid rate
-        limit
-
-    Returns
-    -------
-    The resources asset
-    """
-    g = Github(login_or_token=github_token)
-    repo = g.get_repo('roshan-research/hazm')
-    for release in repo.get_releases():
-        for asset in release.get_assets():
-            if asset.name.startswith(f'resources-{release.tag_name[1:]}'):
-                return asset
-
-
-def download_and_extract_asset(
-    asset: GitReleaseAsset,
-    extract_path: Path,
-) -> None:
-    """
-    Downloads a GitHub asset file and extract it.
-
-    Parameters
-    ----------
-    asset:
-        The GitHub asset to be downloaded
-
-    extract_path:
-        The extract path for the downloaded file to be extracted
-    """
-    chunk_size = 1024 * 1024
-    with typer.progressbar(
-        length=asset.size, label=f'Downloading {asset.name} ...', fill_char='='
-    ) as progress:
-        with requests.get(url=asset.browser_download_url, stream=True) as r:
-            with BytesIO() as io_file:
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    io_file.write(chunk)
-                    progress.update(chunk_size)
-                with ZipFile(io_file) as zip_file:
-                    zip_file.extractall(path=extract_path)
-    typer.secho('Download completed.', fg='green')
+    gdown.download(
+        id='1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3',
+        output=str(
+            Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+        ),
+        quiet=False,
+    )
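With gdown in place the download needs no GitHub token at all, which is why the CI step above shrinks to a bare `python3 -m perke download`. The function form stays importable as before, so the same thing can be done from Python (this mirrors the new implementation; the model lands in perke's bundled `resources` directory):

```python
from perke.cli.download import download

# Same effect as `python3 -m perke download`: fetches hazm's
# pos_tagger.model from Google Drive into perke's resources directory.
download()
```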

diff --git a/perke/unsupervised/graph_based/multipartite_rank.py b/perke/unsupervised/graph_based/multipartite_rank.py
index 123bdc7..aca6561 100644
--- a/perke/unsupervised/graph_based/multipartite_rank.py
+++ b/perke/unsupervised/graph_based/multipartite_rank.py
@@ -52,7 +52,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         super().__init__(valid_pos_tags)
         self.topic_ids: Dict[str, int] = {}
diff --git a/perke/unsupervised/graph_based/position_rank.py b/perke/unsupervised/graph_based/position_rank.py
index 3f0c9c9..268a337 100644
--- a/perke/unsupervised/graph_based/position_rank.py
+++ b/perke/unsupervised/graph_based/position_rank.py
@@ -42,8 +42,10 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}`.
         """
+        if valid_pos_tags is None:
+            valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
         super().__init__(valid_pos_tags)
         self.positions: DefaultDict[str, float] = defaultdict(float)

@@ -66,22 +68,21 @@ def select_candidates(
             defaults to::
                 r\"""
                 NP:
-                    <P>{<N>}<V>
+                    <ADP>{<NOUN>}<VERB>
                 NP:
-                    {<Ne|AJe>*<N|AJ>}
-                    <N>}{<.*e?>.
+                    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
+                    <NOUN>}{<.*(,EZ)?>
                 \"""
-
         maximum_length: `int`
             Maximum length in words of the candidate, defaults to `3`.
         """
         if grammar is None:
             grammar = r"""
                 NP:
-                    <P>{<N>}<V>
+                    <ADP>{<NOUN>}<VERB>
                 NP:
-                    {<Ne|AJe>*<N|AJ>}
-                    <N>}{<.*e?>
+                    {<NOUN,EZ|ADJ,EZ>*<NOUN|ADJ>}
+                    <NOUN>}{<.*(,EZ)?>
             """

         # Select sequence of noun phrases with given pattern
diff --git a/perke/unsupervised/graph_based/single_rank.py b/perke/unsupervised/graph_based/single_rank.py
index 26bbb5a..0a1922d 100644
--- a/perke/unsupervised/graph_based/single_rank.py
+++ b/perke/unsupervised/graph_based/single_rank.py
@@ -40,7 +40,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         super().__init__(valid_pos_tags)
         self.graph_edges_are_weighted: bool = True
diff --git a/perke/unsupervised/graph_based/text_rank.py b/perke/unsupervised/graph_based/text_rank.py
index a49866d..e984db6 100644
--- a/perke/unsupervised/graph_based/text_rank.py
+++ b/perke/unsupervised/graph_based/text_rank.py
@@ -50,7 +50,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         super().__init__(valid_pos_tags)
         self.graph: nx.Graph = nx.Graph()
diff --git a/perke/unsupervised/graph_based/topic_rank.py b/perke/unsupervised/graph_based/topic_rank.py
index 96fe8bb..3abdff6 100644
--- a/perke/unsupervised/graph_based/topic_rank.py
+++ b/perke/unsupervised/graph_based/topic_rank.py
@@ -54,7 +54,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags: `set[str]`, optional
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         super().__init__(valid_pos_tags)
         self.graph: nx.Graph = nx.Graph()
diff --git a/requirements/README.md b/requirements/README.md
deleted file mode 100644
index 918dafc..0000000
--- a/requirements/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# pip requirements files
-- [`main.txt`](main.txt): Default requirements
-- [`test.txt`](test.txt): Requirements for running test suite
-- [`documentation.txt`](documentation.txt): Requirements for building the
-  documentation
diff --git a/requirements/main.txt b/requirements/main.txt
index a01b8e3..3731833 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -4,5 +4,4 @@ networkx
 scipy
 typer==0.5.0
 rich-click==1.5.2
-PyGithub
-requests
+gdown
diff --git a/setup.py b/setup.py
index 4925954..f2b2aa8 100644
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,7 @@
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
         'Operating System :: OS Independent',
         'Intended Audience :: Developers',
         'Intended Audience :: Education',
diff --git a/tests/test_multipartite_rank.py b/tests/test_multipartite_rank.py
index 7a45e5f..513fa41 100644
--- a/tests/test_multipartite_rank.py
+++ b/tests/test_multipartite_rank.py
@@ -7,4 +7,4 @@ def test_original_article_default(text: str) -> None:
     extractor.select_candidates()
     extractor.weight_candidates()
     keyphrases = [keyphrase for keyphrase, weight in extractor.get_n_best(n=3)]
-    assert keyphrases == ['رایانه', 'طبیعی', 'پردازش زبان گفتاری']
+    assert keyphrases == ['رایانه', 'انسانی', 'طبیعی']
diff --git a/tests/test_position_rank.py b/tests/test_position_rank.py
index 0dab485..d8738f0 100644
--- a/tests/test_position_rank.py
+++ b/tests/test_position_rank.py
@@ -3,12 +3,12 @@

 def test_original_article_default(text: str) -> None:
     extractor = PositionRank()
-    extractor.load_text(input=text)
+    extractor.load_text(input=text, universal_pos_tags=False)
     extractor.select_candidates()
     extractor.weight_candidates()
     keyphrases = [keyphrase for keyphrase, weight in extractor.get_n_best(n=3)]
     assert keyphrases == [
-        'پردازش زبان‌های طبیعی',
-        'پردازش زبان طبیعی',
-        'پردازش زبان گفتاری',
+        'کاربردهای پردازش زبان',
+        'پردازش زبان‌های',
+        'کاربردهای گفتاری پردازش',
     ]
diff --git a/tests/test_single_rank.py b/tests/test_single_rank.py
index 43e9a55..41c6f6c 100644
--- a/tests/test_single_rank.py
+++ b/tests/test_single_rank.py
@@ -8,7 +8,7 @@ def test_original_article_default(text: str) -> None:
     extractor.weight_candidates()
     keyphrases = [keyphrase for keyphrase, weight in extractor.get_n_best(n=3)]
     assert keyphrases == [
-        'کاربردهای پردازش زبان طبیعی',
         'کاربردهای متنوع پردازش زبان‌های طبیعی',
-        'پردازش زبان طبیعی',
+        'کاربردهای پردازش زبان طبیعی',
+        'پردازش زبان‌های طبیعی عبارت',
     ]
diff --git a/tests/test_text_rank.py b/tests/test_text_rank.py
index 9c0daa6..bccc14e 100644
--- a/tests/test_text_rank.py
+++ b/tests/test_text_rank.py
@@ -8,8 +8,8 @@ def test_original_article_default(text: str) -> None:
     keyphrases = [keyphrase for keyphrase, weight in extractor.get_n_best(n=3)]
     assert keyphrases == [
         'کاربردهای پردازش زبان طبیعی',
-        'کاربردهای گفتاری پردازش زبان',
         'زمینه درک زبان طبیعی',
+        'پردازش اطلاعات زبانی',
     ]


@@ -22,5 +22,5 @@ def test_with_candidate_selection(text: str) -> None:
     assert keyphrases == [
         'کاربردهای متنوع پردازش زبان‌های طبیعی',
         'کاربردهای پردازش زبان طبیعی',
-        'کاربردهای گفتاری پردازش زبان',
+        'زمینه درک زبان طبیعی',
     ]
diff --git a/tests/test_topic_rank.py b/tests/test_topic_rank.py
index 71bdb81..405e0e9 100644
--- a/tests/test_topic_rank.py
+++ b/tests/test_topic_rank.py
@@ -7,4 +7,4 @@ def test_original_article_default(text: str) -> None:
     extractor.select_candidates()
     extractor.weight_candidates()
     keyphrases = [keyphrase for keyphrase, weight in extractor.get_n_best(n=3)]
-    assert keyphrases == ['طبیعی', 'رایانه', 'پردازش زبان گفتاری']
+    assert keyphrases == ['رایانه', 'پردازش زبان گفتاری', 'طبیعی']
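The updated tests also document the intended usage: PositionRank is the one extractor whose default grammar relies on hazm's native `,EZ` tags, so its `load_text` call must disable universal tags. A sketch mirroring `tests/test_position_rank.py` (the input string below is a stand-in for the suite's `text` fixture):

```python
from perke.unsupervised.graph_based import PositionRank

extractor = PositionRank()

# PositionRank's default grammar and tag set use hazm's native tags,
# so universal tags are turned off here; the other extractors keep
# the universal default.
extractor.load_text(input='پردازش زبان طبیعی ...', universal_pos_tags=False)
extractor.select_candidates()
extractor.weight_candidates()
print(extractor.get_n_best(n=3))
```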