Adopt Perke with changes in Hazm 0.9 release
alirezatheh committed Jun 22, 2023
1 parent 73e5dec commit 4c40097
Showing 25 changed files with 99 additions and 136 deletions.
.github/workflows/tests.yaml (6 changes: 3 additions & 3 deletions)
@@ -10,8 +10,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest]
-        python-version: ['3.8', '3.9', '3.10']
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     steps:
       - name: Checkout repository
@@ -35,7 +35,7 @@ jobs:
       - name: Download resources
         run: |
           python3 -m pip install .
-          python3 -m perke download ${{ secrets.GITHUB_TOKEN }}
+          python3 -m perke download
       - name: Run tests
         run: pytest
.pre-commit-config.yaml (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ repos:
           - --profile=black
           - --line-length=79
   - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.3.0
     hooks:
       - id: black
         args:
CHANGELOG.md (10 changes: 9 additions & 1 deletion)
@@ -7,6 +7,13 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
 ## [Unreleased]
+### Added
+- Added support for Python `3.11`
+- Brought Windows tests back
+
+### Changed
+- Adopted Perke with all changes in
+  [Hazm](https://github.com/roshan-research/hazm) `0.9` release
+
 ## [0.4.1] - 2023-03-15
 ### Fixed
@@ -32,7 +39,8 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - Changed CI from Travis CI to GitHub workflows
 
 ### Removed
-- Removed Windows tests since hazm runs on WSL and WSL tests is same as Linux
+- Removed Windows tests since [Hazm](https://github.com/roshan-research/hazm)
+  runs on WSL and WSL tests is same as Linux
 
 ### Fixed
 - Removed type hints from docstrings
README.md (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 [![pre-commit.ci](https://results.pre-commit.ci/badge/github/AlirezaTheH/perke/main.svg)](https://results.pre-commit.ci/latest/github/alirezatheh/perke/main)
 [![PyPI Version](https://img.shields.io/pypi/v/perke)](https://pypi.python.org/pypi/perke)
 [![Python Versions](https://img.shields.io/pypi/pyversions/perke)](https://pypi.org/project/perke)
-[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/latest/?badge=stable)
+[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/stable/?badge=stable)
 
 Perke is a Python keyphrase extraction package for Persian language. It
 provides an end-to-end keyphrase extraction pipeline in which each component
examples/unsupervised/graph_based/multipartite_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import MultipartiteRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a MultipartiteRank extractor.
 extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)
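Hazm 0.9 replaces the old part of speech tags ('N', 'Ne', 'AJ', 'AJe') with
universal-style tags, so the examples now select candidates with
{'NOUN', 'ADJ'}. Below is a minimal end-to-end sketch of the updated usage;
the weight_candidates and get_n_best calls follow the pattern of Perke's
other examples and are assumptions here, since only select_candidates
appears in this diff:

    from pathlib import Path

    from perke.unsupervised.graph_based import MultipartiteRank

    # Universal-style tags used after the Hazm 0.9 adoption
    valid_pos_tags = {'NOUN', 'ADJ'}

    extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)

    # 'input.txt' is a placeholder path to a Persian document.
    extractor.load_text(
        input=Path('input.txt'),
        word_normalization_method='stemming',
    )

    # Select candidates, weight them, and keep the 10 best keyphrases.
    extractor.select_candidates()
    extractor.weight_candidates()
    keyphrases = extractor.get_n_best(n=10)
    print(keyphrases)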
examples/unsupervised/graph_based/position_rank.py (14 changes: 9 additions & 5 deletions)
@@ -3,23 +3,27 @@
 from perke.unsupervised.graph_based import PositionRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
 
 # Define the grammar for selecting the keyphrase candidates
 grammar = r"""
 NP:
-    <P>{<N>}<V>
+    {<NOUN>}<VERB>
 
 NP:
-    {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-    <N>}{<.*e?>
+    {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+    <NOUN>}{<.*(,EZ)?>
 """
 
 # 1. Create a PositionRank extractor.
 extractor = PositionRank(valid_pos_tags=valid_pos_tags)
 
 # 2. Load the text.
 input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
-extractor.load_text(input=input_filepath, word_normalization_method=None)
+extractor.load_text(
+    input=input_filepath,
+    word_normalization_method=None,
+    universal_pos_tags=False,
+)
 
 # 3. Select the noun phrases up to 3 words as keyphrase candidates.
 extractor.select_candidates(grammar=grammar, maximum_word_number=3)
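The rewritten grammar is an NLTK-style chunk grammar over the new tags,
where ',EZ' marks the Persian ezafe construction that the old tag set
encoded with a trailing 'e'. Below is a standalone sketch of how such a
grammar chunks a tagged sequence, assuming NLTK's RegexpParser semantics
(which Perke's grammar-based candidate selection builds on) and using
made-up sample tokens:

    import nltk

    # Grammar copied from the updated example: chunk a noun followed by a
    # verb, then chunk determiner/noun/numeral/adjective/pronoun runs into
    # noun phrases and chink material after a plain NOUN.
    grammar = r"""
    NP:
        {<NOUN>}<VERB>

    NP:
        {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
        <NOUN>}{<.*(,EZ)?>
    """
    parser = nltk.RegexpParser(grammar)

    # Hypothetical tagged tokens for "ketab-e farsi-e man" (my Persian book)
    tagged = [('کتاب', 'NOUN,EZ'), ('فارسی', 'ADJ,EZ'), ('من', 'PRON')]

    # The second NP rule chunks the whole ezafe chain into one candidate.
    print(parser.parse(tagged))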
examples/unsupervised/graph_based/single_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import SingleRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a SingleRank extractor.
 extractor = SingleRank(valid_pos_tags=valid_pos_tags)
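For readers migrating their own tag sets, the correspondence implied by the
substitutions in this commit looks roughly like the sketch below; treat it
as illustrative, not an exhaustive Hazm 0.9 mapping:

    # Pre-0.9 Hazm tag -> tag used from Hazm 0.9 on, as seen in this diff.
    # The trailing 'e' (ezafe) became an explicit ',EZ' suffix.
    old_to_new_pos_tags = {
        'N': 'NOUN',
        'Ne': 'NOUN,EZ',
        'AJ': 'ADJ',
        'AJe': 'ADJ,EZ',
        'V': 'VERB',
        'PRO': 'PRON',
    }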
examples/unsupervised/graph_based/text_rank.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 from perke.unsupervised.graph_based import TextRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a TextRank extractor.
 extractor = TextRank(valid_pos_tags=valid_pos_tags)
examples/unsupervised/graph_based/topic_rank.py (6 changes: 3 additions & 3 deletions)
@@ -1,15 +1,15 @@
-from os.path import dirname, join
+from pathlib import Path
 
 from perke.unsupervised.graph_based import TopicRank
 
 # Define the set of valid part of speech tags to occur in the model.
-valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+valid_pos_tags = {'NOUN', 'ADJ'}
 
 # 1. Create a TopicRank extractor.
 extractor = TopicRank(valid_pos_tags=valid_pos_tags)
 
 # 2. Load the text.
-input_filepath = join(dirname(dirname(dirname(__file__))), 'input.txt')
+input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
 extractor.load_text(input=input_filepath, word_normalization_method='stemming')
 
 # 3. Select the longest sequences of nouns and adjectives, that do
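Alongside the tag changes, the examples drop os.path in favor of pathlib.
The two spellings below resolve to the same input.txt three directories
above the script, shown as a quick equivalence sketch:

    from os.path import dirname, join
    from pathlib import Path

    # Old spelling: climb three directories up with nested dirname calls.
    old_style = join(dirname(dirname(dirname(__file__))), 'input.txt')

    # New spelling: the same path, expressed with pathlib.
    new_style = Path(__file__).parent.parent.parent / 'input.txt'

    # For typical absolute __file__ values the two are identical.
    assert str(new_style) == old_style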
perke/base/extractor.py (29 changes: 18 additions & 11 deletions)
@@ -45,7 +45,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
         ----------
         valid_pos_tags:
             Set of valid part of speech tags, defaults to nouns and
-            adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
+            adjectives. I.e. `{'NOUN', 'ADJ'}`.
         """
         self.word_normalization_method: Optional[str] = None
         self.sentences: List[Sentence] = []
@@ -54,13 +54,14 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
             punctuation_marks
         )
         if valid_pos_tags is None:
-            valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
+            valid_pos_tags = {'NOUN', 'ADJ'}
         self.valid_pos_tags: Set[str] = valid_pos_tags
 
     def load_text(
         self,
         input: Union[str, Path],
         word_normalization_method: WordNormalizationMethod = 'stemming',
+        universal_pos_tags: bool = True,
     ) -> None:
         """
         Loads the text of a document or string.
@@ -74,9 +75,15 @@ def load_text(
             Word normalization method, defaults to `'stemming'`. See
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not,
+            defaults to `True`.
         """
         # Initialize reader
-        reader = RawTextReader(input, word_normalization_method)
+        reader = RawTextReader(
+            input, word_normalization_method, universal_pos_tags
+        )
 
         # Load sentences
         self.sentences = reader.read()
@@ -225,7 +232,7 @@ def _add_candidate_occurrence(
             The offset of the occurrence
 
         normalized_words:
-            List of normalized of words of the occurrence
+            List of normalized words of the occurrence
         """
         # Build the canonical form of the candidate
         canonical_form = ' '.join(normalized_words)
@@ -306,7 +313,7 @@ def _select_candidates_with_longest_sequences(
             first = sequence_offsets[0]
             last = sequence_offsets[-1]
 
-            # Add the ngram as a new candidate occurrence
+            # Add the n-gram as a new candidate occurrence
             self._add_candidate_occurrence(
                 words=sentence.words[first : last + 1],
                 offset=offset_shift + first,
@@ -336,20 +343,20 @@ def _select_candidates_with_grammar(
             defaults to::
 
                 r\"""
                 NP:
-                    <P>{<N>}<V>
+                    {<NOUN>}<VERB>
 
                 NP:
-                    {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-                    <N>}{<.*e?>'
+                    {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+                    <NOUN>}{<.*(,EZ)?>
                 \"""
         """
         # Initialize default grammar if none provided
         if grammar is None:
             grammar = r"""
             NP:
-                <P>{<N>}<V>
+                {<NOUN>}<VERB>
 
             NP:
-                {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
-                <N>}{<.*e?>
+                {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
+                <NOUN>}{<.*(,EZ)?>
             """
 
         # Initialize parser
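The new universal_pos_tags flag controls which tag flavor the loaded text
carries: the coarse universal tags (the default) or the detailed
',EZ'-suffixed tags that the default grammar above expects. A hedged usage
sketch follows; the input string is a placeholder, and only load_text and
select_candidates are taken from this diff:

    from perke.unsupervised.graph_based import PositionRank

    extractor = PositionRank(
        valid_pos_tags={'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
    )

    # Keep the detailed tags so the default grammar in
    # _select_candidates_with_grammar can match ',EZ' constructions.
    extractor.load_text(
        input='یک متن فارسی نمونه',  # placeholder Persian text
        word_normalization_method=None,
        universal_pos_tags=False,
    )

    # With grammar omitted, the docstring's default grammar is used.
    extractor.select_candidates(maximum_word_number=3)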
perke/base/readers.py (22 changes: 16 additions & 6 deletions)
@@ -1,4 +1,3 @@
-from os.path import dirname, join
 from pathlib import Path
 from typing import List
 
@@ -31,7 +30,9 @@ class Reader:
     """
 
     def __init__(
-        self, word_normalization_method: WordNormalizationMethod
+        self,
+        word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags: bool,
     ) -> None:
         """
         Initializes the reader.
@@ -42,17 +43,22 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
         self.word_normalization_method: WordNormalizationMethod = (
             word_normalization_method
         )
         self.normalizer: hazm.Normalizer = hazm.Normalizer()
         self.stemmer: hazm.Stemmer = hazm.Stemmer()
         self.lemmatizer: hazm.Lemmatizer = hazm.Lemmatizer()
-        model_filepath = join(
-            dirname(dirname(__file__)), 'resources', 'postagger.model'
-        )
-        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(model=model_filepath)
+        self.pos_tagger: hazm.POSTagger = hazm.POSTagger(
+            model=str(
+                Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+            ),
+            universal_tag=universal_pos_tags,
+        )


class RawTextReader(Reader):
@@ -69,6 +75,7 @@ def __init__(
         self,
         input: str,
         word_normalization_method: WordNormalizationMethod,
+        universal_pos_tags,
     ) -> None:
         """
         Initializes the reader.
@@ -82,8 +89,11 @@ def __init__(
             Word normalization method, see
             `perke.base.types.WordNormalizationMethod` for available
             methods.
+        universal_pos_tags:
+            Whether to use universal part of speech tags or not
         """
-        super().__init__(word_normalization_method)
+        super().__init__(word_normalization_method, universal_pos_tags)
 
         # If input is a filepath
         if isinstance(input, Path):
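The reader simply forwards universal_pos_tags to Hazm's tagger as
universal_tag. A standalone sketch of the underlying Hazm 0.9 behavior
follows; the sample sentence is made up, and pos_tagger.model must already
be on disk (for example after running python -m perke download):

    import hazm

    tokens = hazm.word_tokenize('کتاب فارسی خواندم')

    # Coarse universal tags: 'NOUN', 'ADJ', 'VERB', ...
    universal = hazm.POSTagger(model='pos_tagger.model', universal_tag=True)
    print(universal.tag(tokens))

    # Detailed tags that keep the ezafe marker, e.g. 'NOUN,EZ'
    detailed = hazm.POSTagger(model='pos_tagger.model', universal_tag=False)
    print(detailed.tag(tokens))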
perke/cli/download.py (84 changes: 11 additions & 73 deletions)
@@ -1,91 +1,29 @@
-from io import BytesIO
 from pathlib import Path
-from typing import Optional
-from zipfile import ZipFile
 
-import requests
-import rich_click.typer as typer
-from github import Github
-from github.GitReleaseAsset import GitReleaseAsset
+import gdown
 
 from perke.cli.base import app
 
 
 @app.command('download')
-def download_command(
-    github_token: Optional[str] = typer.Argument(
-        None,
-        help=(
-            'The GitHub token to use with GitHub API in order to avoid rate'
-            'limit'
-        ),
-    ),
-) -> None:
+def download_command() -> None:
     """
     Perke requires a trained POS tagger model. We use hazm's tagger
     model. This command aims to easily download latest hazm's resources
     (tagger and parser models).
     """
-    download(github_token)
+    download()
 
 
-def download(github_token: Optional[str] = None) -> None:
+def download() -> None:
     """
     Function version of `download_command` to be available in the
     package.
     """
-    asset = get_latest_resources_asset(github_token)
-    extract_path = Path(__file__).parent.parent / 'resources'
-    download_and_extract_asset(asset, extract_path)
-
-
-def get_latest_resources_asset(github_token: str) -> GitReleaseAsset:
-    """
-    Searches through hazm's releases and find the latest release that
-    contains resources.
-
-    Parameters
-    ----------
-    github_token:
-        The GitHub token to use with GitHub API in order to avoid rate
-        limit
-
-    Returns
-    -------
-    The resources asset
-    """
-    g = Github(login_or_token=github_token)
-    repo = g.get_repo('roshan-research/hazm')
-    for release in repo.get_releases():
-        for asset in release.get_assets():
-            if asset.name.startswith(f'resources-{release.tag_name[1:]}'):
-                return asset
-
-
-def download_and_extract_asset(
-    asset: GitReleaseAsset,
-    extract_path: Path,
-) -> None:
-    """
-    Downloads a GitHub asset file and extract it.
-
-    Parameters
-    ----------
-    asset:
-        The GitHub asset to be downloaded
-
-    extract_path:
-        The extract path for the downloaded file to be extracted
-    """
-    chunk_size = 1024 * 1024
-    with typer.progressbar(
-        length=asset.size, label=f'Downloading {asset.name} ...', fill_char='='
-    ) as progress:
-        with requests.get(url=asset.browser_download_url, stream=True) as r:
-            with BytesIO() as io_file:
-                for chunk in r.iter_content(chunk_size=chunk_size):
-                    io_file.write(chunk)
-                    progress.update(chunk_size)
-                with ZipFile(io_file) as zip_file:
-                    zip_file.extractall(path=extract_path)
-    typer.secho('Download completed.', fg='green')
+    gdown.download(
+        id='1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3',
+        output=str(
+            Path(__file__).parent.parent / 'resources' / 'pos_tagger.model'
+        ),
+        quiet=False,
+    )
