Source code for fortex.huggingface.question_and_answering_single

# Copyright 2021 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Wrapper of the Question Answering models on the HuggingFace platform
(context understanding).
"""
import importlib
from typing import Dict, Set

from transformers import pipeline
from ft.onto.base_ontology import Phrase

from forte.common import Resources
from forte.common.configuration import Config
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

__all__ = [
    "QuestionAnsweringSingle",
]


class QuestionAnsweringSingle(PackProcessor):
    r"""Wrapper of the models on the HuggingFace platform with the pipeline
    tag `question-answering` (reading comprehension):
    https://huggingface.co/models?pipeline_tag=question-answering

    This wrapper accepts any model name from the HuggingFace platform with
    the `question-answering` pipeline tag in its configs. It predicts an
    answer from the context of the user-specified entry type in the input
    pack and annotates the prediction as a `Phrase` in the pack. The
    question is provided through the config.
    """

    def __init__(self):
        super().__init__()
        self.extractor = None

    def set_up(self):
        # -1 runs the model on CPU; a non-negative value selects a CUDA device.
        device_num = self.configs["cuda_devices"]
        self.extractor = pipeline(
            "question-answering",
            model=self.configs.model_name,
            tokenizer=self.configs.model_name,
            framework="pt",
            device=device_num,
        )
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.set_up()
    def _process(self, input_pack: DataPack):
        # Resolve the configured entry type from its fully qualified name.
        path_str, module_str = self.configs.entry_type.rsplit(".", 1)
        mod = importlib.import_module(path_str)
        entry = getattr(mod, module_str)
        for entry_specified in input_pack.get(entry_type=entry):
            result = self.extractor(
                context=entry_specified.text,
                question=self.configs.question,
                max_answer_len=self.configs.max_answer_len,
                handle_impossible_answer=self.configs.handle_impossible_answer,
            )
            # Annotate the predicted answer span as a Phrase in the pack.
            start = result["start"]
            end = result["end"]
            Phrase(pack=input_pack, begin=start, end=end)
    @classmethod
    def default_configs(cls):
        r"""This defines a basic config structure for
        `QuestionAnsweringSingle`. Following are the keys for this
        dictionary:

        - `entry_type`: defines which entry type in the input pack to make
          predictions on. The default makes a prediction on each `Document`
          in the input pack.
        - `model_name`: the language model; the default is
          `"ktrapeznikov/biobert_v1.1_pubmed_squad_v2"`. The wrapper supports
          HuggingFace models with the `question-answering` pipeline tag.
        - `question`: one question whose answer is retrieved from the
          context in the input pack.
        - `max_answer_len`: the maximum length of predicted answers (i.e.,
          only answers with a shorter length are considered).
        - `cuda_devices`: device ordinal for CPU/GPU support. Setting this
          to -1 runs the model on the CPU; a non-negative value runs it on
          the associated CUDA device id.
        - `handle_impossible_answer`: whether or not we accept impossible
          as an answer.

        Returns: A dictionary with the default config for this processor.
        """
        return {
            "entry_type": "ft.onto.base_ontology.Document",
            "model_name": "ktrapeznikov/biobert_v1.1_pubmed_squad_v2",
            "question": "Where do I live",
            "max_answer_len": 15,
            "cuda_devices": -1,
            "handle_impossible_answer": False,
        }
    def expected_types_and_attributes(self):
        r"""Method to add the user-specified expected type, which would be
        checked before running the processor if the pipeline is initialized
        with `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        return {self.configs["entry_type"]: set()}
    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add the output type record of `QuestionAnsweringSingle`,
        which is `"ft.onto.base_ontology.Phrase"`, to
        :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack for type records that
                needs to be filled in for consistency checking.
        """
        if "ft.onto.base_ontology.Phrase" not in record_meta.keys():
            record_meta["ft.onto.base_ontology.Phrase"] = set()
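

# A minimal usage sketch: wiring this processor into a Forte pipeline with a
# `StringReader`, asking one question against a raw text string and printing
# the predicted answer span. The question and text below are illustrative
# placeholders, and `forte.pipeline.Pipeline` / `forte.data.readers.StringReader`
# are assumed to be available from the core Forte package; this is not part of
# the processor's required setup.
if __name__ == "__main__":
    from forte.data.readers import StringReader
    from forte.pipeline import Pipeline

    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(
        QuestionAnsweringSingle(),
        config={"question": "Who described the structure of DNA?"},
    )
    pl.initialize()

    # StringReader turns the raw string into a DataPack whose Document is the
    # default `entry_type` the processor predicts on.
    pack = pl.process(
        "The structure of DNA was described by Watson and Crick in 1953."
    )
    for phrase in pack.get(Phrase):
        print(phrase.text)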