Source code for fortex.stanza.stanza_processor

# Copyright 2019 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import List, Any, Dict, Set

import stanza
from ft.onto.base_ontology import Token, Sentence, Dependency

from forte.common import ProcessorConfigError
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

# Names exported by a star-import of this module.
__all__ = ["StandfordNLPProcessor"]


class StandfordNLPProcessor(PackProcessor):
    """Pack processor that annotates text with a Stanza pipeline.

    For every sentence returned by the pipeline a ``Sentence`` entry is
    added to the pack. Depending on the comma-separated ``processors``
    config, ``Token`` entries (optionally carrying ``pos``/``ud_xpos`` and
    ``lemma``) and ``Dependency`` entries are added as well.
    """

    def __init__(self):
        super().__init__()
        # The Stanza pipeline; created in `initialize`.
        self.nlp = None
        # Individual processor names parsed from `configs.processors`;
        # filled in by `set_up`.
        self.processors: Set[str] = set()

    @staticmethod
    def _parse_processors(processors: str) -> Set[str]:
        """Split a comma-separated processor list into a set of names.

        Whitespace around each name is stripped and empty items are
        dropped, so ``"tokenize, pos"`` and ``"tokenize,pos,"`` both yield
        ``{"tokenize", "pos"}``.

        Args:
            processors: The comma-separated processor names.

        Returns:
            The set of individual processor names.
        """
        return {
            name.strip() for name in processors.split(",") if name.strip()
        }

    @staticmethod
    def _word_span(word):
        """Return the ``(start, end)`` character offsets of a Stanza word.

        The ``start_char``/``end_char`` attributes are used when the word
        object provides them; otherwise the offsets are parsed from the
        ``key=value`` items of the ``|``-separated ``misc`` field.

        Args:
            word: A word object taken from ``sentence.words``.

        Returns:
            A ``(start, end)`` tuple of non-negative integers.

        Raises:
            ValueError: If either offset cannot be determined.
        """
        start = getattr(word, "start_char", None)
        end = getattr(word, "end_char", None)

        if start is None or end is None:
            # `misc` may be missing altogether, hence the `or ""`.
            for item in (word.misc or "").split("|"):
                # `partition` tolerates items without "=" or with
                # several "=" signs, which `split` unpacking does not.
                key, _, value = item.partition("=")
                if key == "start_char":
                    start = int(value)
                elif key == "end_char":
                    end = int(value)

        if start is None or end is None or start < 0 or end < 0:
            raise ValueError(
                "Cannot determine word start or end for "
                "stanfordnlp."
            )
        return start, end

    def set_up(self):
        """Download the Stanza models and cache the processor names.

        Models for ``configs.lang`` are downloaded into ``configs.dir``,
        and ``self.processors`` is set from ``configs.processors``.
        """
        stanza.download(self.configs.lang, self.configs.dir)
        self.processors = self._parse_processors(self.configs.processors)

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        """Validate the configuration and build the Stanza pipeline.

        Args:
            resources: Shared resources, forwarded to the base class.
            configs: Processor configuration, forwarded to the base class.

        Raises:
            ProcessorConfigError: If ``pos``, ``lemma`` or ``depparse`` is
                requested without ``tokenize``.
        """
        super().initialize(resources, configs)

        # Validate on exact processor names rather than on substrings of
        # the raw config string, matching the set used by `_process`.
        requested = self._parse_processors(self.configs.processors)
        needs_tokens = requested & {"pos", "lemma", "depparse"}
        if needs_tokens and "tokenize" not in requested:
            raise ProcessorConfigError(
                "tokenize is necessary in "
                "configs.processors for "
                "pos or lemma or depparse"
            )

        self.set_up()
        self.nlp = stanza.Pipeline(  # type: ignore
            lang=self.configs.lang,
            dir=self.configs.dir,
            use_gpu=self.configs.use_gpu,
            processors=self.configs.processors,
        )

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """
        This defines a basic config structure for StanfordNLP.

        Returns:
            A dictionary with the following keys:

            - ``processors``: comma-separated processor names passed to
              the Stanza pipeline.
            - ``lang``: language code for the language to build the
              pipeline.
            - ``use_gpu``: forwarded to the pipeline's ``use_gpu`` option.
            - ``dir``: directory used for downloading and loading models.
        """
        return {
            "processors": "tokenize,pos,lemma,depparse",
            "lang": "en",
            # Language code for the language to build the Pipeline
            "use_gpu": False,
            "dir": ".",
        }

    def _process(self, input_pack: DataPack):
        """Run the pipeline on the pack text and add the annotations.

        Args:
            input_pack: The data pack whose text is analysed and to which
                ``Sentence``, ``Token`` and ``Dependency`` entries are
                added.

        Raises:
            ValueError: If the character span of a word cannot be
                determined.
        """
        doc = input_pack.text

        if len(doc) == 0:
            logging.warning("Find empty text in doc.")
            # Nothing to annotate; do not run the pipeline on empty input.
            return

        # Iterating through the sentence objects produced by the pipeline.
        for sentence in self.nlp(doc).sentences:
            if not sentence.tokens:
                # A sentence without tokens has no span to record.
                continue

            Sentence(
                input_pack,
                sentence.tokens[0].start_char,
                sentence.tokens[-1].end_char,
            )

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                for word in sentence.words:
                    t_start, t_end = self._word_span(word)
                    token = Token(input_pack, t_start, t_end)

                    if "pos" in self.processors:
                        token.pos = word.pos
                        token.ud_xpos = word.xpos

                    if "lemma" in self.processors:
                        token.lemma = word.lemma

                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens.
            if "depparse" in self.processors:
                for child, word in zip(tokens, sentence.words):
                    # `head` is a 1-based index into the sentence, with 0
                    # reserved for the artificial root. Indexing with
                    # `head - 1` would then wrap around to the last token,
                    # so the root word is linked to itself instead.
                    if word.head == 0:
                        parent = child
                    else:
                        parent = tokens[word.head - 1]
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.rel_type = word.deprel

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of current processor
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        # Derive the names from the configuration in one place so that all
        # checks below agree with each other.
        requested = self._parse_processors(self.configs.processors)

        record_meta["ft.onto.base_ontology.Sentence"] = set()
        if "tokenize" not in requested:
            return

        token_attributes: Set[str] = set()
        if "pos" in requested:
            token_attributes.add("pos")
        if "lemma" in requested:
            token_attributes.add("lemma")
        record_meta["ft.onto.base_ontology.Token"] = token_attributes

        if "depparse" in requested:
            record_meta["ft.onto.base_ontology.Dependency"] = {"rel_type"}