Исходный код recs_searcher.preprocessing._base_normalize
"""
Алгоритмы для нормализации текста.
"""
from typing import List
from ..base import BaseTransformation
import spacy
[документация]
class LemmatizeSpacy(BaseTransformation):
    """Lemmatize the words of each text with a spaCy model.

    The spaCy pipeline is loaded once, at construction time, and reused for
    every call to ``_transform``.
    """

    def __init__(self, spacy_model_name: str):
        """Load the spaCy pipeline, downloading it first if loading fails.

        Args:
            spacy_model_name: Name handed to ``spacy.load`` and, when the
                first load fails, to ``spacy.cli.download``.
        """
        super().__init__()
        try:
            self._spacy_model = spacy.load(spacy_model_name)
        except OSError:
            # An OSError from the first load is treated as "model is not
            # installed": fetch it and retry the load exactly once.
            print(f'Downloading {spacy_model_name}...')
            spacy.cli.download(spacy_model_name)
            self._spacy_model = spacy.load(spacy_model_name)

    def _transform(self, array: List[str]) -> List[str]:
        """Replace every token of every text with its lemma.

        Args:
            array: Texts to lemmatize.

        Returns:
            A new list with one entry per input text, in the same order.
            Each entry is the space-joined sequence of token lemmas; a token
            whose lemma is empty is kept in its original surface form.
        """
        transformed_array = []
        for text in array:
            doc = self._spacy_model(text)
            # Fall back to ``token.text`` (a str), not the Token object
            # itself: ``str.join`` raises TypeError on non-string items.
            lemmas = [token.lemma_ or token.text for token in doc]
            transformed_array.append(' '.join(lemmas))
        return transformed_array
[документация]
class RemoveStopwordsSpacy(BaseTransformation):
    """Strip stop words from texts using a spaCy model."""

    def __init__(self, spacy_model_name: str):
        """Load the spaCy pipeline, downloading it first if loading fails.

        Args:
            spacy_model_name: Name handed to ``spacy.load`` and, when the
                first load fails, to ``spacy.cli.download``.
        """
        super().__init__()
        try:
            model = spacy.load(spacy_model_name)
        except OSError:
            # First load failed: download the model, then load it again.
            print(f'Downloading {spacy_model_name}...')
            spacy.cli.download(spacy_model_name)
            model = spacy.load(spacy_model_name)
        self._spacy_model = model

    def _transform(self, array: List[str]) -> List[str]:
        """Drop the tokens that the model flags as stop words.

        Args:
            array: Texts to clean.

        Returns:
            A new list with one entry per input text, in the same order.
            Each entry is the space-joined text of the remaining tokens, or
            the unchanged input text when every token was a stop word.
        """
        results = []
        for source_text in array:
            kept_words = [
                tok.text
                for tok in self._spacy_model(source_text)
                if not tok.is_stop
            ]
            # Never return an empty string: when nothing survives the
            # filter, pass the original text through untouched.
            results.append(' '.join(kept_words) if kept_words else source_text)
        return results