Source code for fortex.huggingface.transformers_processor

# Copyright 2019 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Any, Set

from nltk.tokenize.util import align_tokens
from transformers import AutoTokenizer
from ft.onto.base_ontology import Subword

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

__all__ = [
    "BERTTokenizer",
]


[docs]class BERTTokenizer(PackProcessor):
    r"""A wrapper of BERT tokenizer."""

    def __init__(self):
        super().__init__()
        self.tokenizer = None

    # pylint: disable=unused-argument
[docs]    def initialize(self, resources: Resources, configs: Config):
        self.tokenizer = AutoTokenizer.from_pretrained(configs.model_path)

    def _process(self, input_pack: DataPack):
        inputs = self.tokenizer(input_pack.text, return_tensors="pt")
        tokens = self.tokenizer.convert_ids_to_tokens(
            inputs["input_ids"][0].tolist()
        )[1:-1]
        tokens_clean = [
            token.replace("##", "") if token.startswith("##") else token
            for token in tokens
        ]

        for i, (begin, end) in enumerate(
            align_tokens(tokens_clean, input_pack.text.lower())
        ):
            subword = Subword(input_pack, begin, end)
            subword.is_first_segment = not tokens[i].startswith("##")

[docs]    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        r"""Returns a `dict` of configurations of the processor with default
        values. Used to replace the missing values of input ``configs`` during
        pipeline construction.
        """
        return {"model_path": None}

[docs]    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type `ft.onto.base_ontology.Subword`
        of current processor `BERTTokenizer`
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Subword"] = {"is_first_segment"}