from typing import List, Dict, Set

from forte.common.configuration import Config
from forte.common.resources import Resources
from import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import EntityMention, Token, Sentence, Phrase
from nltk import (  # type: ignore
from nltk.chunk import RegexpParser
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordTokenizer

__all__ = [

class NLTKWordTokenizer(PackProcessor):
    r"""Word tokenizer processor built on NLTK's ``TreebankWordTokenizer``.

    For every character span reported by the tokenizer over the pack text,
    a ``Token`` entry is created on the input pack.
    """

    def __init__(self):
        super().__init__()
        # A single tokenizer instance is created up front and reused for
        # every pack handed to ``_process``.
        self.tokenizer = TreebankWordTokenizer()

    def _process(self, input_pack: DataPack):
        r"""Create one ``Token`` per span found in ``input_pack.text``.

        Args:
            input_pack: the data pack whose text is tokenized.
        """
        for span in self.tokenizer.span_tokenize(input_pack.text):
            start, stop = span
            # The entry is not kept locally; presumably constructing it
            # attaches it to ``input_pack`` (confirm against the ontology).
            Token(input_pack, start, stop)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output type of `NLTKWordTokenizer`, which is
        `ft.onto.base_ontology.Token`, in ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        # Any previous record for Token is replaced by an empty attribute set.
        record_meta["ft.onto.base_ontology.Token"] = set()
class NLTKPOSTagger(PackProcessor):
    r"""Part-of-speech tagging processor built on NLTK's ``pos_tag``.

    Reads the ``Token`` entries of a pack and stores the predicted tag on
    each token's ``pos`` attribute.
    """

    def __init__(self):
        super().__init__()
        # Passed as ``components`` when fetching tokens; ``None`` by default.
        self.token_component = None

    def initialize(self, resources: Resources, configs: Config):
        r"""Run the base initialization and fetch the tagger resource.

        Args:
            resources: shared resources forwarded to the base class.
            configs: processor configuration forwarded to the base class.
        """
        super().initialize(resources, configs)
        download("averaged_perceptron_tagger")

    def _process(self, input_pack: DataPack):
        r"""Tag every token of ``input_pack`` in a single ``pos_tag`` call.

        Args:
            input_pack: the data pack whose tokens receive a ``pos`` value.
        """
        tokens = list(
            input_pack.get(entry_type=Token, components=self.token_component)
        )
        words = [tok.text for tok in tokens]
        tagged_words = pos_tag(words)

        # Each tagging result is indexed at position 1 to obtain the tag.
        for tok, tagged in zip(tokens, tagged_words):
            tok.pos = tagged[1]

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output of `NLTKPOSTagger`, which adds attribute
        `pos` to `ft.onto.base_ontology.Token`, in ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        # The Token key is indexed directly, so it must already be present.
        record_meta["ft.onto.base_ontology.Token"].add("pos")

    def expected_types_and_attributes(self):
        r"""Return the expected input type `ft.onto.base_ontology.Token`,
        which would be checked before running the processor if the pipeline
        is initialized with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        return {"ft.onto.base_ontology.Token": set()}
class NLTKLemmatizer(PackProcessor):
    r"""Lemmatization processor built on NLTK's ``WordNetLemmatizer``.

    Reads each token's text and ``pos`` attribute and stores the resulting
    lemma on the token's ``lemma`` attribute.
    """

    def __init__(self):
        super().__init__()
        # Passed as ``components`` when fetching tokens; ``None`` by default.
        self.token_component = None
        self.lemmatizer = WordNetLemmatizer()

    def initialize(self, resources: Resources, configs: Config):
        r"""Run the base initialization and fetch the ``wordnet`` resource.

        Args:
            resources: shared resources forwarded to the base class.
            configs: processor configuration forwarded to the base class.
        """
        super().initialize(resources, configs)
        download("wordnet")

    def _process(self, input_pack: DataPack):
        r"""Lemmatize every token of ``input_pack``.

        Args:
            input_pack: the data pack whose tokens receive a ``lemma`` value.

        Raises:
            AssertionError: if a token has no ``pos`` value (only while
                assertions are enabled).
        """
        tokens: List[Token] = list(
            input_pack.get(entry_type=Token, components=self.token_component)
        )

        # Phase 1: gather the surface forms and the Morphy-style tags. All
        # tokens are validated here, before any lemma is written back.
        words: List[str] = []
        morphy_tags: List[str] = []
        for tok in tokens:
            words.append(tok.text)
            assert tok.pos is not None
            morphy_tags.append(penn2morphy(tok.pos))

        # Phase 2: compute every lemma before touching the tokens.
        lemmas = [
            self.lemmatizer.lemmatize(word, tag)
            for word, tag in zip(words, morphy_tags)
        ]

        # Phase 3: write the results back, in token order.
        for tok, lemma in zip(tokens, lemmas):
            tok.lemma = lemma

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output of `NLTKLemmatizer`, which adds attribute
        `lemma` to `ft.onto.base_ontology.Token`, in ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        # The Token key is indexed directly, so it must already be present.
        record_meta["ft.onto.base_ontology.Token"].add("lemma")

    def expected_types_and_attributes(self):
        r"""Return the expected input type `ft.onto.base_ontology.Token`
        with attribute `pos`, which would be checked before running the
        processor if the pipeline is initialized with
        `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        return {"ft.onto.base_ontology.Token": {"pos"}}
def penn2morphy(penntag: str) -> str:
    r"""Convert a Penn-format tag to its Morphy counterpart.

    Only the first two characters of ``penntag`` are inspected.

    Args:
        penntag: the tag in Penn format, e.g. ``"NNS"`` or ``"VBZ"``.

    Returns:
        ``"n"``, ``"a"``, ``"v"`` or ``"r"`` when the tag starts with
        ``"NN"``, ``"JJ"``, ``"VB"`` or ``"RB"`` respectively; ``"n"`` for
        every other tag.
    """
    prefix_to_morphy = {"NN": "n", "JJ": "a", "VB": "v", "RB": "r"}
    # Unknown prefixes (including tags shorter than two characters) fall
    # back to the noun tag.
    return prefix_to_morphy.get(penntag[:2], "n")
class NLTKChunker(PackProcessor):
    r"""Chunking processor built on NLTK's ``RegexpParser``.

    For each sentence, the POS-tagged tokens are parsed with the configured
    pattern and one ``Phrase`` entry is created per labelled chunk.
    """

    def __init__(self):
        super().__init__()
        # The parser depends on the configured pattern, so it is only built
        # in ``initialize``.
        self.chunker = None

    def initialize(self, resources: Resources, configs: Config):
        r"""Run the base initialization and build the chunk parser.

        Args:
            resources: shared resources forwarded to the base class.
            configs: processor configuration; ``configs.pattern`` is the
                grammar handed to ``RegexpParser``.
        """
        super().initialize(resources, configs)
        download("maxent_ne_chunker")
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""This defines a basic config structure for NLTKChunker.

        Returns:
            A dict with the chunk grammar under ``"pattern"`` and the
            ``"token_component"`` / ``"sentence_component"`` filters, both
            ``None`` by default.
        """
        return {
            "pattern": "NP: {<DT>?<JJ>*<NN>}",
            "token_component": None,
            "sentence_component": None,
        }

    def _process(self, input_pack: DataPack):
        r"""Create a ``Phrase`` for every labelled chunk of every sentence.

        Args:
            input_pack: the data pack providing sentences and tagged tokens.
        """
        sentences = input_pack.get(
            Sentence, components=self.configs.sentence_component
        )
        for sentence in sentences:
            sent_tokens = list(
                input_pack.get(
                    entry_type=Token,
                    range_annotation=sentence,
                    components=self.configs.token_component,
                )
            )
            tagged = [(tok.text, tok.pos) for tok in sent_tokens]
            parsed = self.chunker.parse(tagged)

            # ``cursor`` is the index in ``sent_tokens`` of the first token
            # covered by the node currently being visited.
            cursor = 0
            for node in parsed:
                if not hasattr(node, "label"):
                    # A plain tagged word, for example: ('is', 'VBZ')
                    cursor += 1
                    continue

                # A labelled chunk, for example:
                # Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                width = len(node)
                first_token = sent_tokens[cursor]
                last_token = sent_tokens[cursor + width - 1]
                phrase = Phrase(
                    input_pack, first_token.span.begin, last_token.span.end
                )
                phrase.phrase_type = node.label()
                cursor += width

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output of `NLTKChunker`, which adds
        `ft.onto.base_ontology.Phrase` with attribute `phrase_type`, in
        ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Phrase"] = {"phrase_type"}

    def expected_types_and_attributes(self):
        r"""Return the expected input types `ft.onto.base_ontology.Token`
        with attribute `pos` and `ft.onto.base_ontology.Sentence`, which
        would be checked before running the processor if the pipeline is
        initialized with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        return {
            "ft.onto.base_ontology.Sentence": set(),
            "ft.onto.base_ontology.Token": {"pos"},
        }
class NLTKSentenceSegmenter(PackProcessor):
    r"""Sentence segmentation processor built on NLTK's
    ``PunktSentenceTokenizer``.

    For every character span reported by the splitter over the pack text,
    a ``Sentence`` entry is created on the input pack.
    """

    def __init__(self):
        super().__init__()
        # The splitter is constructed with no arguments and reused for
        # every pack.
        self.sent_splitter = PunktSentenceTokenizer()

    def initialize(self, resources: Resources, configs: Config):
        r"""Run the base initialization and fetch the ``punkt`` resource.

        Args:
            resources: shared resources forwarded to the base class.
            configs: processor configuration forwarded to the base class.
        """
        super().initialize(resources, configs)
        download("punkt")

    def _process(self, input_pack: DataPack):
        r"""Create one ``Sentence`` per span found in ``input_pack.text``.

        Args:
            input_pack: the data pack whose text is segmented.
        """
        for span in self.sent_splitter.span_tokenize(input_pack.text):
            start, stop = span
            # The entry is not kept locally; presumably constructing it
            # attaches it to ``input_pack`` (confirm against the ontology).
            Sentence(input_pack, start, stop)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output type of `NLTKSentenceSegmenter`, which is
        `ft.onto.base_ontology.Sentence`, in ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        # Any previous record for Sentence is replaced by an empty set.
        record_meta["ft.onto.base_ontology.Sentence"] = set()
class NLTKNER(PackProcessor):
    r"""Named entity recognition processor built on NLTK's ``ne_chunk``.

    For each sentence, the POS-tagged tokens are passed to ``ne_chunk`` and
    one ``EntityMention`` entry is created per labelled chunk.
    """

    def __init__(self):
        super().__init__()
        # Passed as ``components`` when fetching tokens; ``None`` by default.
        self.token_component = None

    def initialize(self, resources: Resources, configs: Config):
        r"""Run the base initialization and fetch the NER resources.

        Args:
            resources: shared resources forwarded to the base class.
            configs: processor configuration forwarded to the base class.
        """
        super().initialize(resources, configs)
        download("maxent_ne_chunker")
        download("words")

    def _process(self, input_pack: DataPack):
        r"""Create an ``EntityMention`` for every labelled chunk of every
        sentence.

        Args:
            input_pack: the data pack providing sentences and tagged tokens.
        """
        for sentence in input_pack.get(Sentence):
            sent_tokens = list(
                input_pack.get(
                    entry_type=Token,
                    range_annotation=sentence,
                    components=self.token_component,
                )
            )
            tagged = [(tok.text, tok.pos) for tok in sent_tokens]
            ne_tree = ne_chunk(tagged)

            # ``cursor`` is the index in ``sent_tokens`` of the first token
            # covered by the node currently being visited.
            cursor = 0
            for node in ne_tree:
                if not hasattr(node, "label"):
                    # A plain tagged word, for example: ('This', 'DT')
                    cursor += 1
                    continue

                # A labelled chunk, for example:
                # Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
                width = len(node)
                first_token = sent_tokens[cursor]
                last_token = sent_tokens[cursor + width - 1]
                entity = EntityMention(
                    input_pack, first_token.span.begin, last_token.span.end
                )
                entity.ner_type = node.label()
                cursor += width

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Register the output of `NLTKNER`, which is
        `ft.onto.base_ontology.EntityMention` with attribute `ner_type`, in
        ``record_meta``.

        Args:
            record_meta: the field in the datapack for type record that
                needs to be filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.EntityMention"] = {"ner_type"}

    def expected_types_and_attributes(self):
        r"""Return the expected input types `ft.onto.base_ontology.Token`
        with attribute `pos` and `ft.onto.base_ontology.Sentence`, which
        would be checked before running the processor if the pipeline is
        initialized with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        return {
            "ft.onto.base_ontology.Sentence": set(),
            "ft.onto.base_ontology.Token": {"pos"},
        }