# Copyright 2021 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Set

from nltk import (  # type: ignore
    pos_tag,
    ne_chunk,
    PunktSentenceTokenizer,
    download,
)
from nltk.chunk import RegexpParser
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordTokenizer

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import EntityMention, Token, Sentence, Phrase

__all__ = [
    "NLTKPOSTagger",
    "NLTKSentenceSegmenter",
    "NLTKWordTokenizer",
    "NLTKLemmatizer",
    "NLTKChunker",
    "NLTKNER",
]


class NLTKWordTokenizer(PackProcessor):
    r"""A wrapper of the NLTK word tokenizer."""

    def __init__(self):
        super().__init__()
        self.tokenizer = TreebankWordTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.tokenizer.span_tokenize(input_pack.text):
            Token(input_pack, begin, end)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKWordTokenizer`, which is
        `ft.onto.base_ontology.Token`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Token"] = set()
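

# An illustrative note (not part of the original module): `span_tokenize`
# yields character offsets into the pack text, so for the text
# "Forte wraps NLTK." the spans would be roughly (0, 5), (6, 11), (12, 16)
# and (16, 17), and each span becomes one `Token` annotation anchored on
# those offsets in the `DataPack`.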


class NLTKPOSTagger(PackProcessor):
    r"""A wrapper of the NLTK POS tagger."""

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        download("averaged_perceptron_tagger")

    def __init__(self):
        super().__init__()
        self.token_component = None

    def _process(self, input_pack: DataPack):
        token_entries = list(
            input_pack.get(entry_type=Token, components=self.token_component)
        )
        token_texts = [token.text for token in token_entries]
        taggings = pos_tag(token_texts)
        for token, tag in zip(token_entries, taggings):
            token.pos = tag[1]

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKPOSTagger`, which adds the
        attribute `pos` to `ft.onto.base_ontology.Token`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Token"].add("pos")

    def expected_types_and_attributes(self):
        r"""Add the expected input type `ft.onto.base_ontology.Token`, which
        is checked before running the processor if the pipeline is
        initialized with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` is enabled
        for the pipeline.
        """
        return {"ft.onto.base_ontology.Token": set()}


class NLTKLemmatizer(PackProcessor):
    r"""A wrapper of the NLTK lemmatizer."""

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        download("wordnet")

    def __init__(self):
        super().__init__()
        self.token_component = None
        self.lemmatizer = WordNetLemmatizer()

    def _process(self, input_pack: DataPack):
        token_entries: List[Token] = list(
            input_pack.get(entry_type=Token, components=self.token_component)
        )
        token_texts: List[str] = []
        token_poses: List[str] = []
        for token in token_entries:
            token_texts.append(token.text)
            assert token.pos is not None
            token_poses.append(penn2morphy(token.pos))
        lemmas = [
            self.lemmatizer.lemmatize(token_texts[i], token_poses[i])
            for i in range(len(token_texts))
        ]
        for token, lemma in zip(token_entries, lemmas):
            token.lemma = lemma

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKLemmatizer`, which adds the
        attribute `lemma` to `ft.onto.base_ontology.Token`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Token"].add("lemma")

    def expected_types_and_attributes(self):
        r"""Add the expected input type `ft.onto.base_ontology.Token` with
        the attribute `pos`, which is checked before running the processor
        if the pipeline is initialized with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` is enabled
        for the pipeline.
        """
        return {"ft.onto.base_ontology.Token": {"pos"}}


def penn2morphy(penntag: str) -> str:
    r"""Converts a Penn Treebank POS tag to the Morphy (WordNet) POS tag;
    tags outside the mapping default to noun."""
    morphy_tag = {"NN": "n", "JJ": "a", "VB": "v", "RB": "r"}
    if penntag[:2] in morphy_tag:
        return morphy_tag[penntag[:2]]
    else:
        return "n"


class NLTKChunker(PackProcessor):
    r"""A wrapper of the NLTK chunker."""

    def __init__(self):
        super().__init__()
        self.chunker = None

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        download("maxent_ne_chunker")
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""This defines a basic configuration structure for `NLTKChunker`:
        the chunking grammar `pattern` and the component names used to
        select input tokens and sentences."""
        return {
            "pattern": "NP: {<DT>?<JJ>*<NN>}",
            "token_component": None,
            "sentence_component": None,
        }
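
    # A hedged usage sketch (not part of the original module): the chunking
    # grammar above can be overridden through the processor config when the
    # chunker is added to a pipeline; the grammar string shown is only an
    # illustrative `RegexpParser` pattern, not a Forte default.
    #
    #     pipeline.add(
    #         NLTKChunker(),
    #         config={"pattern": "NP: {<DT|PP\\$>?<JJ>*<NN.*>+}"},
    #     )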

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(
            Sentence, components=self.configs.sentence_component
        ):
            token_entries = list(
                input_pack.get(
                    entry_type=Token,
                    range_annotation=sentence,
                    components=self.configs.token_component,
                )
            )
            tokens = [(token.text, token.pos) for token in token_entries]
            cs = self.chunker.parse(tokens)
            index = 0
            for chunk in cs:
                if hasattr(chunk, "label"):
                    # A matched chunk is a subtree, for example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    phrase.phrase_type = chunk.label()
                    index += len(chunk)
                else:
                    # An unmatched token stays a plain (text, tag) pair,
                    # for example: chunk: ('is', 'VBZ')
                    index += 1

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKChunker`, which adds
        `ft.onto.base_ontology.Phrase` with the attribute `phrase_type`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Phrase"] = {"phrase_type"}

    def expected_types_and_attributes(self):
        r"""Add the expected input types `ft.onto.base_ontology.Token` with
        the attribute `pos` and `ft.onto.base_ontology.Sentence`, which are
        checked before running the processor if the pipeline is initialized
        with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` is enabled
        for the pipeline.
        """
        return {
            "ft.onto.base_ontology.Sentence": set(),
            "ft.onto.base_ontology.Token": {"pos"},
        }


class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of the NLTK sentence tokenizer."""

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        download("punkt")

    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKSentenceSegmenter`, which is
        `ft.onto.base_ontology.Sentence`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Sentence"] = set()


class NLTKNER(PackProcessor):
    r"""A wrapper of the NLTK named entity recognizer."""

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        download("maxent_ne_chunker")
        download("words")

    def __init__(self):
        super().__init__()
        self.token_component = None

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(
                    entry_type=Token,
                    range_annotation=sentence,
                    components=self.token_component,
                )
            )
            tokens = [(token.text, token.pos) for token in token_entries]
            ne_tree = ne_chunk(tokens)
            index = 0
            for chunk in ne_tree:
                if hasattr(chunk, "label"):
                    # A recognized entity is a subtree, for example:
                    # chunk: Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    entity = EntityMention(input_pack, begin_pos, end_pos)
                    entity.ner_type = chunk.label()
                    index += len(chunk)
                else:
                    # A non-entity token stays a plain (text, tag) pair,
                    # for example: chunk: ('This', 'DT')
                    index += 1

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Add the output type record of `NLTKNER`, which is
        `ft.onto.base_ontology.EntityMention` with the attribute `ner_type`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                which is filled in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.EntityMention"] = {"ner_type"}

    def expected_types_and_attributes(self):
        r"""Add the expected input types `ft.onto.base_ontology.Token` with
        the attribute `pos` and `ft.onto.base_ontology.Sentence`, which are
        checked before running the processor if the pipeline is initialized
        with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` is enabled
        for the pipeline.
        """
        return {
            "ft.onto.base_ontology.Sentence": set(),
            "ft.onto.base_ontology.Token": {"pos"},
        }
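

# A minimal end-to-end sketch (not part of the original module). It assumes
# `forte.pipeline.Pipeline` and `forte.data.readers.StringReader` are
# available and that the NLTK data downloads triggered in `initialize()`
# succeed; run this file directly to print the annotations these wrappers add.
if __name__ == "__main__":
    from forte.data.readers import StringReader
    from forte.pipeline import Pipeline

    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(StringReader())
    pipeline.add(NLTKSentenceSegmenter())
    pipeline.add(NLTKWordTokenizer())
    pipeline.add(NLTKPOSTagger())
    pipeline.add(NLTKLemmatizer())
    pipeline.add(NLTKChunker())
    pipeline.add(NLTKNER())
    pipeline.initialize()

    pack: DataPack = pipeline.process(
        "Mary visited New York last week to see the new museum."
    )
    for sentence in pack.get(Sentence):
        print("Sentence:", sentence.text)
        for token in pack.get(Token, sentence):
            print("  Token:", token.text, token.pos, token.lemma)
        for phrase in pack.get(Phrase, sentence):
            print("  Phrase:", phrase.text, phrase.phrase_type)
        for entity in pack.get(EntityMention, sentence):
            print("  Entity:", entity.text, entity.ner_type)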