Source code for fortex.tweepy.twittersearch_processor
# Copyright 2021 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Any
import yaml
from ft.onto.base_ontology import Document
import tweepy as tw
from forte.common.configuration import Config
from forte.data.multi_pack import MultiPack
from forte.processors.base import MultiPackProcessor
from forte.data.data_pack import DataPack
# Names exported by ``from <this module> import *``.
__all__ = [
"TweetSearchProcessor",
]
class TweetSearchProcessor(MultiPackProcessor):
    """
    TweetSearchProcessor is designed to query tweets with Tweepy and
    the Twitter API.

    Every returned tweet whose language matches the configured one is added
    to the input multipack as a new data pack.
    """

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        # pylint: disable=line-too-long
        """This defines a basic config structure for TweetSearchProcessor.

        For more details about the parameters, refer to
        https://docs.tweepy.org/en/latest/api.html#tweepy.API.search_tweets
        and
        https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/get-search-tweets

        NOTE: the request issued by `_query_tweets` currently asks for a
        fixed 10 recent tweets and does not read `"num_tweets_returned"`,
        `"date_since"` or `"result_type"`; these keys are kept for
        configuration compatibility.

        Returns:
            A dictionary with the default config for this processor.

        Following are the keys for this dictionary:

            - `"credential_file"`:
              Defines the path of the credential file needed for Twitter
              API usage. The file is parsed as YAML and must provide a
              `bearer_token` entry.
            - `"num_tweets_returned"`:
              Defines the number of tweets returned by processor.
            - `"lang"`:
              Language, restricts tweets to the given language,
              default is 'en'.
            - `"date_since"`:
              Restricts tweets created after the given date.
            - `"result_type"`:
              Defines what type of search results to receive.
              The default is "recent". Valid values include:

                mixed : include both popular and real time results in
                the response
                recent : return only the most recent results in the
                response
                popular : return only the most popular results in the
                response.
            - `"query_pack_name"`:
              The query pack's name, default is "query".
            - `"response_pack_name_prefix"`:
              The pack name prefix to be used in response data packs.
        """
        # pylint: enable=line-too-long
        return {
            "credential_file": "",
            "num_tweets_returned": 5,
            "lang": "en",
            "date_since": "2020-01-01",
            "result_type": "recent",
            "query_pack_name": "query",
            "response_pack_name_prefix": "passage",
        }

    def _process(self, input_pack: MultiPack):
        r"""Search using Twitter API to fetch tweets for a query.

        This query should be contained in the input multipack with name
        `self.configs.query_pack_name`.

        Each tweet in the configured language is added as a new data pack
        named `<response_pack_name_prefix>_<idx>`, where `idx` is the
        position of the tweet in the search response, and a
        `ft.onto.base_ontology.Document` annotation is used to cover the
        whole document. Tweets in any other language are skipped, so the
        indices of the created packs may have gaps.

        Args:
            input_pack: A multipack containing query as a pack.
        """
        query_pack = input_pack.get_pack(self.configs.query_pack_name)
        query = query_pack.text
        tweets = self._query_tweets(query)

        # The response carries no data (``None``) when nothing matched the
        # query; treat that as an empty result instead of failing on it.
        for idx, tweet in enumerate(tweets.data or []):
            if tweet.lang != self.configs.lang:
                # Skip tweets that are not in the desired language. Without
                # this, the text of a previous tweet (or an unbound name)
                # would be used for the new pack.
                continue

            txt = tweet.text
            pack_name = f"{self.configs.response_pack_name_prefix}_{idx}"

            pack: DataPack = input_pack.add_pack(pack_name)
            pack.pack_name = pack_name
            pack.set_text(txt)

            # Annotate the full tweet text as a single document.
            Document(pack=pack, begin=0, end=len(txt))

    def _query_tweets(self, query: str):
        """
        This function searches tweets using Tweepy.

        The credential file configured as `credential_file` is read on
        every call and its `bearer_token` is used to build the client.

        Args:
            query: user's input query for twitter API search

        Returns:
            The response object of `search_recent_tweets`; the matched
            tweets are available through its `data` attribute.
        """
        with open(self.configs.credential_file, "r", encoding="utf-8") as f:
            credentials = yaml.safe_load(f)
        credentials = Config(credentials, default_hparams=None)

        api = tw.Client(  # type: ignore
            bearer_token=credentials.bearer_token,
        )

        # Collect tweets; "lang" is requested explicitly because `_process`
        # filters on it.
        tweets = api.search_recent_tweets(
            query=query,
            tweet_fields=["context_annotations", "created_at", "lang"],
            max_results=10,
        )
        return tweets