Source code for fortex.stanza.stanza_processor

# Copyright 2019 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import List, Any, Dict, Set

import stanza
from ft.onto.base_ontology import Token, Sentence, Dependency, EntityMention

from forte.common import ProcessorConfigError
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

__all__ = ["StandfordNLPProcessor"]


class StandfordNLPProcessor(PackProcessor):
    def __init__(self):
        super().__init__()
        self.nlp = None
        self.processors = {}

    def set_up(self):
        self.processors = dict(self.configs.processors)
        stanza.download(
            self.configs.lang, self.configs.dir, processors=self.processors
        )

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        if (
            "pos" in configs.processors
            or "lemma" in configs.processors
            or "depparse" in configs.processors
            or "ner" in configs.processors
        ):
            if "tokenize" not in configs.processors:
                raise ProcessorConfigError(
                    "tokenize is necessary in "
                    "configs.processors for "
                    "pos or lemma or depparse or ner"
                )
        self.set_up()
        self.nlp = stanza.Pipeline(  # type: ignore
            lang=self.configs.lang,
            dir=self.configs.dir,
            use_gpu=self.configs.use_gpu,
            processors=self.processors,
        )
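
    # Usage sketch (illustrative, not part of this module): the processor is
    # normally run inside a Forte pipeline, which calls `initialize()` above.
    #
    #     from forte.data.readers import StringReader
    #     from forte.pipeline import Pipeline
    #
    #     pipeline = Pipeline[DataPack]()
    #     pipeline.set_reader(StringReader())
    #     pipeline.add(StandfordNLPProcessor())
    #     pipeline.initialize()
    #     pack = pipeline.process("Forte wraps stanza. Stanza parses text.")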

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """
        This defines a basic config structure for StanfordNLP.
        """
        return {
            # "processors": "tokenize,pos,lemma,depparse,ner",
            "processors": {
                "tokenize": "default",
                "pos": "default",
                "lemma": "default",
                "depparse": "default",
                "ner": "i2b2",
            },
            # Language code for the language to build the Pipeline.
            "lang": "en",
            "use_gpu": False,
            "dir": ".",
        }
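
    # A minimal sketch of overriding the defaults above when adding the
    # processor to a pipeline; the `config` dict mirrors `default_configs`,
    # and any key left out keeps its default value.
    #
    #     pipeline.add(
    #         StandfordNLPProcessor(),
    #         config={
    #             "processors": {"tokenize": "default", "pos": "default"},
    #             "lang": "en",
    #             "use_gpu": False,
    #         },
    #     )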

    def _process(self, input_pack: DataPack):
        doc = input_pack.text

        if len(doc) == 0:
            logging.warning("Found empty text in doc.")

        # Sentence parsing.
        sentences = self.nlp(doc).sentences

        # Iterate through stanza sentence objects.
        for sentence in sentences:
            Sentence(
                input_pack,
                sentence.tokens[0].start_char,
                sentence.tokens[-1].end_char,
            )

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                # Iterate through stanza word objects.
                for word in sentence.words:
                    t_start = word.start_char
                    t_end = word.end_char

                    if t_start < 0 or t_end < 0:
                        raise ValueError(
                            "Cannot determine word start or end from "
                            "the stanza output."
                        )

                    token = Token(input_pack, t_start, t_end)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens.
            if "depparse" in self.processors:
                # Iterate through token entries in the current sentence.
                for token, word in zip(tokens, sentence.words):
                    if word.head == 0:
                        # The sentence root has head index 0 in stanza and
                        # no parent token, so no dependency is created.
                        continue
                    child = token  # current token
                    parent = tokens[word.head - 1]  # head token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.deprel

            # For each sentence, get the entity mentions.
            if "ner" in self.processors:
                # Iterate through all entities.
                for ent in sentence.entities:
                    entity = EntityMention(
                        input_pack, ent.start_char, ent.end_char
                    )
                    entity.ner_type = ent.type
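
    # Reading the annotations created by `_process` back out of a processed
    # pack (a sketch; `pack` is assumed to come from `pipeline.process(...)`):
    #
    #     for sentence in pack.get(Sentence):
    #         for token in pack.get(Token, sentence):
    #             print(token.text, token.pos, token.lemma)
    #         for ent in pack.get(EntityMention, sentence):
    #             print(ent.text, ent.ner_type)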

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add the output type records of the current processor
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records,
                filled in here for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Sentence"] = set()
        if "tokenize" in self.configs.processors:
            record_meta["ft.onto.base_ontology.Token"] = set()
            if "pos" in self.configs.processors:
                record_meta["ft.onto.base_ontology.Token"].add("pos")
            if "lemma" in self.configs.processors:
                record_meta["ft.onto.base_ontology.Token"].add("lemma")
            if "depparse" in self.configs.processors:
                record_meta["ft.onto.base_ontology.Dependency"] = {"rel_type"}
            if "ner" in self.configs.processors:
                record_meta["ft.onto.base_ontology.EntityMention"] = {
                    "ner_type"
                }
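
    # With the default configs, `record` fills `record_meta` roughly as
    # follows (derived from the code above, shown for illustration):
    #
    #     {
    #         "ft.onto.base_ontology.Sentence": set(),
    #         "ft.onto.base_ontology.Token": {"pos", "lemma"},
    #         "ft.onto.base_ontology.Dependency": {"rel_type"},
    #         "ft.onto.base_ontology.EntityMention": {"ner_type"},
    #     }
    #
    # If your Forte version supports it, these records are checked when
    # `pipeline.enforce_consistency(True)` is set before initialization.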