Adopt Perke with changes in Hazm 0.9 release
alirezatheh committed Jun 22, 2023
1 parent 73e5dec commit 4c40097
Showing 25 changed files with 99 additions and 136 deletions.
.github/workflows/tests.yaml (6 changes: 3 additions & 3 deletions)
@@ -10,8 +10,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest]
-        python-version: ['3.8', '3.9', '3.10']
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     steps:
       - name: Checkout repository
@@ -35,7 +35,7 @@ jobs:
       - name: Download resources
         run: |
           python3 -m pip install .
-          python3 -m perke download ${{ secrets.GITHUB_TOKEN }}
+          python3 -m perke download
       - name: Run tests
         run: pytest
.pre-commit-config.yaml (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ repos:
           - --profile=black
           - --line-length=79
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
     hooks:
       - id: black
         args:
CHANGELOG.md (10 changes: 9 additions & 1 deletion)
@@ -7,6 +7,13 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
 ## [Unreleased]
+### Added
+- Added support for Python `3.11`
+- Brought Windows tests back
+
+### Changed
+- Adopted Perke with all changes in
+  [Hazm](https://github.com/roshan-research/hazm) `0.9` release
+
 ## [0.4.1] - 2023-03-15
 ### Fixed
@@ -32,7 +39,8 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - Changed CI from Travis CI to GitHub workflows
 
 ### Removed
-- Removed Windows tests since hazm runs on WSL and WSL tests is same as Linux
+- Removed Windows tests since [Hazm](https://github.com/roshan-research/hazm)
+  runs on WSL and WSL tests is same as Linux
 
 ### Fixed
 - Removed type hints from docstrings
README.md (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 [![pre-commit.ci](https://results.pre-commit.ci/badge/github/AlirezaTheH/perke/main.svg)](https://results.pre-commit.ci/latest/github/alirezatheh/perke/main)
 [![PyPI Version](https://img.shields.io/pypi/v/perke)](https://pypi.python.org/pypi/perke)
 [![Python Versions](https://img.shields.io/pypi/pyversions/perke)](https://pypi.org/project/perke)
-[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/latest/?badge=stable)
+[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/stable/?badge=stable)
 
 Perke is a Python keyphrase extraction package for Persian language. It
 provides an end-to-end keyphrase extraction pipeline in which each component
examples/unsupervised/graph_based/multipartite_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import MultipartiteRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a MultipartiteRank extractor.
 extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)
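Hazm 0.9 replaces the old part of speech tags ('N', 'Ne', 'AJ', 'AJe') with
universal-style tags, so the examples now select candidates with
{'NOUN', 'ADJ'}. Below is a minimal end-to-end sketch of the updated usage;
the weight_candidates and get_n_best calls follow the pattern of Perke's
other examples and are assumptions here, since only select_candidates
appears in this diff:

    from pathlib import Path

    from perke.unsupervised.graph_based import MultipartiteRank

    # Universal-style tags used after the Hazm 0.9 adoption
    valid_pos_tags = {'NOUN', 'ADJ'}

    extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)

    # 'input.txt' is a placeholder path to a Persian document.
    extractor.load_text(
        input=Path('input.txt'),
        word_normalization_method='stemming',
    )

    # Select candidates, weight them, and keep the 10 best keyphrases.
    extractor.select_candidates()
    extractor.weight_candidates()
    keyphrases = extractor.get_n_best(n=10)
    print(keyphrases)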
examples/unsupervised/graph_based/position_rank.py (14 changes: 9 additions & 5 deletions)
@@ -3,23 +3,27 @@
 from perke.unsupervised.graph_based import PositionRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
 
 # Define the grammar for selecting the keyphrase candidates
 grammar = r"""
 NP:
-    <P>{<N>}<V>
+    {<NOUN>}<VERB>
 
 NP:
-    {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-    <N>}{<.*e?>
+    {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+    <NOUN>}{<.*(,EZ)?>
 """
 
 # 1. Create a PositionRank extractor.
 extractor = PositionRank(valid_pos_tags=valid_pos_tags)
 
 # 2. Load the text.
 input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
-extractor.load_text(input=input_filepath, word_normalization_method=None)
+extractor.load_text(
+    input=input_filepath,
+    word_normalization_method=None,
+    universal_pos_tags=False,
+)
 
 # 3. Select the noun phrases up to 3 words as keyphrase candidates.
 extractor.select_candidates(grammar=grammar, maximum_word_number=3)
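The rewritten grammar is an NLTK-style chunk grammar over the new tags,
where ',EZ' marks the Persian ezafe construction that the old tag set
encoded with a trailing 'e'. Below is a standalone sketch of how such a
grammar chunks a tagged sequence, assuming NLTK's RegexpParser semantics
(which Perke's grammar-based candidate selection builds on) and using
made-up sample tokens:

    import nltk

    # Grammar copied from the updated example: chunk a noun followed by a
    # verb, then chunk determiner/noun/numeral/adjective/pronoun runs into
    # noun phrases and chink material after a plain NOUN.
    grammar = r"""
    NP:
        {<NOUN>}<VERB>

    NP:
        {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
        <NOUN>}{<.*(,EZ)?>
    """
    parser = nltk.RegexpParser(grammar)

    # Hypothetical tagged tokens for "ketab-e farsi-e man" (my Persian book)
    tagged = [('کتاب', 'NOUN,EZ'), ('فارسی', 'ADJ,EZ'), ('من', 'PRON')]

    # The second NP rule chunks the whole ezafe chain into one candidate.
    print(parser.parse(tagged))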
examples/unsupervised/graph_based/single_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import SingleRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a SingleRank extractor.
 extractor = SingleRank(valid_pos_tags=valid_pos_tags)
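For readers migrating their own tag sets, the correspondence implied by the
substitutions in this commit looks roughly like the sketch below; treat it
as illustrative, not an exhaustive Hazm 0.9 mapping:

    # Pre-0.9 Hazm tag -> tag used from Hazm 0.9 on, as seen in this diff.
    # The trailing 'e' (ezafe) became an explicit ',EZ' suffix.
    old_to_new_pos_tags = {
        'N': 'NOUN',
        'Ne': 'NOUN,EZ',
        'AJ': 'ADJ',
        'AJe': 'ADJ,EZ',
        'V': 'VERB',
        'PRO': 'PRON',
    }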
examples/unsupervised/graph_based/text_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import TextRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a TextRank extractor.
 extractor = TextRank(valid_pos_tags=valid_pos_tags)
examples/unsupervised/graph_based/topic_rank.py (6 changes: 3 additions & 3 deletions)
@@ -1,15 +1,15 @@
-from os.path import dirname, join
+from pathlib import Path
 
 from perke.unsupervised.graph_based import TopicRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a TopicRank extractor.
 extractor = TopicRank(valid_pos_tags=valid_pos_tags)
 
 # 2. Load the text.
-input_filepath = join(dirname(dirname(dirname(__file__))), 'input.txt')
+input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
 extractor.load_text(input=input_filepath, word_normalization_method='stemming')
 
 # 3. Select the longest sequences of nouns and adjectives, that do
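Alongside the tag changes, the examples drop os.path in favor of pathlib.
The two spellings below resolve to the same input.txt three directories
above the script, shown as a quick equivalence sketch:

    from os.path import dirname, join
    from pathlib import Path

    # Old spelling: climb three directories up with nested dirname calls.
    old_style = join(dirname(dirname(dirname(__file__))), 'input.txt')

    # New spelling: the same path, expressed with pathlib.
    new_style = Path(__file__).parent.parent.parent / 'input.txt'

    # For typical absolute __file__ values the two are identical.
    assert str(new_style) == old_style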
perke/base/extractor.py (29 changes: 18 additions & 11 deletions)
@@ -45,7 +45,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         self.word_normalization_method: Optional[str] = None
         self.sentences: List[Sentence] = []
@@ -54,13 +54,14 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
             punctuation_marks
         )
         if valid_pos_tags is None:
-            valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+            valid_pos_tags = {'NOUN', 'ADJ'}
         self.valid_pos_tags: Set[str] = valid_pos_tags
 
     def load_text(
         self,
         input: Union[str, Path],
         word_normalization_method: WordNormalizationMethod = 'stemming',
+        universal_pos_tags: bool = True,
     ) -> None:
         """
         Loads the text of a document or string.
@@ -74,9 +75,15 @@ def load_text(
             Word normalization method, defaults to `'stemming'`. See
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not,
+            defaults to `True`.
         """
         # Initialize reader
-        reader = RawTextReader(input, word_normalization_method)
+        reader = RawTextReader(
+            input, word_normalization_method, universal_pos_tags
+        )
 
         # Load sentences
         self.sentences = reader.read()
@@ -225,7 +232,7 @@ def _add_candidate_occurrence(
             The offset of the occurrence
 
         normalized_words:
-            List of normalized of words of the occurrence
+            List of normalized words of the occurrence
         """
         # Build the canonical form of the candidate
         canonical_form = ' '.join(normalized_words)
@@ -306,7 +313,7 @@ def _select_candidates_with_longest_sequences(
             first = sequence_offsets[0]
             last = sequence_offsets[-1]
 
-            # Add the ngram as a new candidate occurrence
+            # Add the n-gram as a new candidate occurrence
             self._add_candidate_occurrence(
                 words=sentence.words[first : last + 1],
                 offset=offset_shift + first,
@@ -336,20 +343,20 @@ def _select_candidates_with_grammar(
             defaults to::
 
                 r\"""
                 NP:
-                    <P>{<N>}<V>
+                    {<NOUN>}<VERB>
 
                 NP:
-                    {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-                    <N>}{<.*e?>'
+                    {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+                    <NOUN>}{<.*(,EZ)?>
                 \"""
         """
         # Initialize default grammar if none provided
         if grammar is None:
             grammar = r"""
             NP:
-                <P>{<N>}<V>
+                {<NOUN>}<VERB>
 
             NP:
-                {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-                <N>}{<.*e?>
+                {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+                <NOUN>}{<.*(,EZ)?>
             """
 
         # Initialize parser
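The new universal_pos_tags flag controls which tag flavor the loaded text
carries: the coarse universal tags (the default) or the detailed
',EZ'-suffixed tags that the default grammar above expects. A hedged usage
sketch follows; the input string is a placeholder, and only load_text and
select_candidates are taken from this diff:

    from perke.unsupervised.graph_based import PositionRank

    extractor = PositionRank(
        valid_pos_tags={'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
    )

    # Keep the detailed tags so the default grammar in
    # _select_candidates_with_grammar can match ',EZ' constructions.
    extractor.load_text(
        input='یک متن فارسی نمونه',  # placeholder Persian text
        word_normalization_method=None,
        universal_pos_tags=False,
    )

    # With grammar omitted, the docstring's default grammar is used.
    extractor.select_candidates(maximum_word_number=3)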
perke/base/readers.py (22 changes: 16 additions & 6 deletions)
@@ -1,4 +1,3 @@
-from os.path import dirname, join
 from pathlib import Path
 from typing import List
 
@@ -31,7 +30,9 @@ class Reader:
     """
 
     def __init__(
-        self, word_normalization_method: WordNormalizationMethod
+        self,
+        word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags: bool,
     ) -> None:
         """
         Initializes the reader.
@@ -42,17 +43,22 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
         self.word_normalization_method: WordNormalizationMethod = (
             word_normalization_method
         )
         self.normalizer: hazm.Normalizer = hazm.Normalizer()
         self.stemmer: hazm.Stemmer = hazm.Stemmer()
         self.lemmatizer: hazm.Lemmatizer = hazm.Lemmatizer()
-        model_filepath = join(
-            dirname(dirname(__file__)), 'resources', 'postagger.model'
-        )
-        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(model=model_filepath)
+        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(
+            model=str(
+                Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+            ),
+            universal_tag=universal_pos_tags,
+        )


class RawTextReader(Reader):
@@ -69,6 +75,7 @@ def __init__(
         self,
         input: str,
         word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags,
     ) -> None:
         """
         Initializes the reader.
@@ -82,8 +89,11 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
-        super().__init__(word_normalization_method)
+        super().__init__(word_normalization_method, universal_pos_tags)
 
         # If input is a filepath
         if isinstance(input, Path):
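The reader simply forwards universal_pos_tags to Hazm's tagger as
universal_tag. A standalone sketch of the underlying Hazm 0.9 behavior
follows; the sample sentence is made up, and pos_tagger.model must already
be on disk (for example after running python -m perke download):

    import hazm

    tokens = hazm.word_tokenize('کتاب فارسی خواندم')

    # Coarse universal tags: 'NOUN', 'ADJ', 'VERB', ...
    universal = hazm.POSTagger(model='pos_tagger.model', universal_tag=True)
    print(universal.tag(tokens))

    # Detailed tags that keep the ezafe marker, e.g. 'NOUN,EZ'
    detailed = hazm.POSTagger(model='pos_tagger.model', universal_tag=False)
    print(detailed.tag(tokens))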
perke/cli/download.py (84 changes: 11 additions & 73 deletions)
@@ -1,91 +1,29 @@
-from io import BytesIO
 from pathlib import Path
-from typing import Optional
-from zipfile import ZipFile
 
-import requests
-import rich_click.typer as typer
-from github import Github
-from github.GitReleaseAsset import GitReleaseAsset
+import gdown
 
 from perke.cli.base import app
 
 
 @app.command('download')
-def download_command(
-    github_token: Optional[str] = typer.Argument(
-        None,
-        help=(
-            'The GitHub token to use with GitHub API in order to avoid rate'
-            'limit'
-        ),
-    ),
-) -> None:
+def download_command() -> None:
     """
     Perke requires a trained POS tagger model. We use hazm's tagger
     model. This command aims to easily download latest hazm's resources
     (tagger and parser models).
     """
-    download(github_token)
+    download()
 
 
-def download(github_token: Optional[str] = None) -> None:
+def download() -> None:
     """
     Function version of `download_command` to be available in the
     package.
     """
-    asset = get_latest_resources_asset(github_token)
-    extract_path = Path(__file__).parent.parent / 'resources'
-    download_and_extract_asset(asset, extract_path)
-
-
-def get_latest_resources_asset(github_token: str) -> GitReleaseAsset:
-    """
-    Searches through hazm's releases and find the latest release that
-    contains resources.
-
-    Parameters
-    ----------
-    github_token:
-        The GitHub token to use with GitHub API in order to avoid rate
-        limit
-
-    Returns
-    -------
-    The resources asset
-    """
-    g = Github(login_or_token=github_token)
-    repo = g.get_repo('roshan-research/hazm')
-    for release in repo.get_releases():
-        for asset in release.get_assets():
-            if asset.name.startswith(f'resources-{release.tag_name[1:]}'):
-                return asset
-
-
-def download_and_extract_asset(
-    asset: GitReleaseAsset,
-    extract_path: Path,
-) -> None:
-    """
-    Downloads a GitHub asset file and extract it.
-
-    Parameters
-    ----------
-    asset:
-        The GitHub asset to be downloaded
-
-    extract_path:
-        The extract path for the downloaded file to be extracted
-    """
-    chunk_size = 1024 * 1024
-    with typer.progressbar(
-        length=asset.size, label=f'Downloading {asset.name} ...', fill_char='='
-    ) as progress:
-        with requests.get(url=asset.browser_download_url, stream=True) as r:
-            with BytesIO() as io_file:
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    io_file.write(chunk)
-                    progress.update(chunk_size)
-                with ZipFile(io_file) as zip_file:
-                    zip_file.extractall(path=extract_path)
-    typer.secho('Download completed.', fg='green')
+    gdown.download(
+        id='1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3',
+        output=str(
+            Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+        ),
+        quiet=False,
+    )
