Premium API Support
This commit adds new functionality for searching with the Twitter Premium
Search API. From the command line you can now run:

    twarc search blacklivesmatter --30day > tweets.jsonl

or:

    twarc search blacklivesmatter --fullarchive > tweets.jsonl

Depending on your query this could quickly use up your quota, so you
will likely want to use --from_date and --to_date to limit the time
range that you are searching. You can also use --limit to limit the
number of tweets that are retrieved.

If your app is only authorized for the sandbox, you can use the --sandbox
parameter.

This functionality is made available through the new Twarc.premium_search
method.
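
For example, a minimal sketch of calling the new method from Python
(the docnowdev environment name is just an example; the method
signature is as added in twarc/client.py below):

    from twarc import Twarc

    t = Twarc(app_auth=True)  # premium search requires app auth
    for tweet in t.premium_search('blacklivesmatter', '30day',
                                  'docnowdev', limit=100):
        print(tweet['id_str'])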

Fixes #326
edsu committed Jun 6, 2020
1 parent d49194e commit 691bd5d
Showing 7 changed files with 230 additions and 43 deletions.
22 changes: 8 additions & 14 deletions .travis.yml
@@ -1,33 +1,27 @@
language: python

branches:
only:
- master

python:
- '3.7'

matrix:
include:
- python: 3.7
dist: xenial
sudo: true

install:
- python setup.py install

script:
- python setup.py test

after_failure:
- cat test.log

notifications:
slack:
secure: PY7GyhVBacVqytImbrcyKlDY/uVDTtOMMQ8bEDnhbH9XZA7nOD0dXsEffz93tA1ldJd1+0zsnt9itqdk4pYXBMTcZ6uWE5H2j7KIYylQa2A2Sj5GC3/KjrbFGfqMAXYs+rc9rrlqYkwjztu+bnNqTDnq0Y4Pw0YlMs23NT6GlMg=
env:
global:
- secure: LnQ/9DvR5tWJ6S5/LjtBGRgabLkgsFSoyhuvaYbJXvP8dMo9r8ZHxZOo5nTiVHpgP+x/OQU2dQrVh1oCzs/5Tv3PkOvSqVcw9Tjsc3VqKelWGuPvF9Ut1Bvd+EPg5BweZ+4HzjBg2GIVxI1EJJ0f/wMCHtokp3FJu5qkSdhrZD0=
- secure: B+lxjj4QooAQsvNV14WyqRzbL9u4cnw0WsANSW8lgxE9MRynAgQzg4Mn4KN14DOGuQ7cG/e9H5xQKWk1+JEeuPIT3jjbSoRVE6KEg8Tr0Hx75Vdi8DJc6WoNugYLSV4m5gM1rLP318YhFGtDWUX1e5GaXuN8xsUw8394XHhyDPA=
- secure: IdLHln90ogs+WJK0q1j9coH9SVca90FvEXBRG60o/q9de/fQY72UGSWcirJp69cIbAn1BbKiyuH5HANERwOlr+pKRcLDlpre+Ohlo1bKXSQC9Duq00BtnT9auodVNIR/oySZVbrCQsYGw3eSCHdWv4dxmKYieK/Yns2X955u/ak=
- secure: AMdu1YG92M0dextcxxwdxiMZ6RCH6Li0WLOgMEumiEoopSkAPnEYTlRoFvvZCo40BUQygnQ+PBoVJFG/PKfkVVucSfBT6AnlQiplc0UElIX55RdribMknwyj8pXEUxSW0Clz9mp00oDqrWx+FRepy5LkSk405L1Skt086IwMExc=
- secure: acMO9M6O5XeTxnu6yhtx114qO56nOo25TOCB4FeyneV9iG/JGDOk0PI82ov44xfnUXsCh+6UYEoDP6UiOnJTq4NXg24NeO9dxWZ6k8Y/KeOasubR0nHrH4QN2tfpmJmtgYGJqxaUAnPdCNkQbOcEiKaqDahxzxnpnk54cyoixhg=
- secure: O1jB3+f7K4zJ5xybUC99kJ/tVeYSgftWEeOsVwYcn5KwiPO9zNnNvs3YI3nUMghKLkYdxQux6hesCdxgPtCrzNsgHE6Z9hRZfSLVRJmH33EzpHQG+QEBLIqJun9zKTGqM0pnCFmSB284WBbY4adKlSYv8fXeFnFnjY42J7393f8=
- secure: IpMaXNonQqzXzCPyUsBflSNH8NSA0ajsLHXi5I43Dk2hXTijrvuf37OCAPV+vvhN39Ry5yofFpTIaesreiqWIrGuIEuUslq0zeTTOUWWQ9wwIh5SiSRIgBSQr3CqjPegyAjMlF2hJBP4LSSMpRyyRo3LvRJYakT6yG51NTmhmNU=
- secure: eTLsh5vX/eQB8Fxd1FNpcOggC+3wZTl+5DU+ESdW/NLTEYzuE1Yr4nBG6tGCAHXBPtJKgNUft8vrOhwLl1D8WdSgZqv0ji6kdpIqR2tFRzWt/j/Emh7XFlMdm5pKrtKj0mdgTSpPXqr4upGQVcl2Dn8JWTfQL183XGlRQH3u1zA=
- secure: alUgeEe643AHU3cGms+SpKNTh9nU/zKP6QxgY9TBwzaoaYU7wQkYz0B7dXSdU9hRA2aDpdeqyxr5Z9fE610JjSPpsTfSnWwo46RV9BQSqTqvfm9mH0zWsb6IFlnSbQIWSmC1bfbjlaVcFKOSsUDAo7RRMuwuHcQGDQFioB7ugwE=
46 changes: 46 additions & 0 deletions README.md
@@ -262,6 +262,52 @@ To get the users that are on a list you can use the list URL with the

twarc listmembers https://twitter.com/edsu/lists/bots

## Premium APIs

Twitter introduced the Premium Search APIs, which let you pay Twitter money for
tweets. Once you have set up an environment in your
[dashboard](https://developer.twitter.com/en/dashboard) you can use the 30day
and fullarchive endpoints to search for tweets outside the 7-day window provided
by the Standard Search API. To use the premium API from the command line you
will need to indicate which endpoint you are using, and the environment.

To avoid using up your entire budget you will likely want to limit the search
to some time range using `--from_date` and `--to_date`. Additionally you can
limit the maximum number of tweets returned using `--limit`.

So, for example, if I wanted to get all the blacklivesmatter tweets from two
weeks ago (assuming today is June 1, 2020) using my environment named
*docnowdev*, while retrieving no more than 1000 tweets, I could run:

twarc search blacklivesmatter \
--30day docnowdev \
--from_date 2020-05-01 \
--to_date 2020-05-14 \
--limit 1000 \
> tweets.jsonl

Similarly, to find tweets from 2014 using the full archive you can run:

twarc search blacklivesmatter \
--fullarchive docnowdev \
--from_date 2014-08-04 \
--to_date 2014-08-05 \
--limit 1000 \
> tweets.jsonl

If your environment is sandboxed you will need to use `--sandbox` so that twarc
knows not to request more than 100 tweets at a time (the default for
non-sandboxed environments is 500):

twarc search blacklivesmatter \
--fullarchive docnowdev \
--from_date 2014-08-04 \
--to_date 2014-08-05 \
--limit 1000 \
--sandbox \
> tweets.jsonl
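
The new `premium_search` method can also be called directly from Python. A
minimal sketch, assuming your keys are already configured for app auth and
reusing the *docnowdev* environment from the examples above:

    import datetime
    import twarc

    t = twarc.Twarc(app_auth=True)
    for tweet in t.premium_search(
            q='blacklivesmatter',
            product='fullarchive',
            environment='docnowdev',
            from_date=datetime.date(2014, 8, 4),
            to_date=datetime.date(2014, 8, 5),
            sandbox=True,
            limit=1000):
        print(tweet['id_str'])

Note that `premium_search` is a generator, so tweets are fetched page by page
as you iterate.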


## Use as a Library

If you want you can use twarc programmatically as a library to collect
Expand Down
4 changes: 2 additions & 2 deletions setup.py
@@ -4,7 +4,7 @@
from setuptools import setup

# Also in twarc/__init__.py
__version__ = '1.9.0'

with open("README.md") as f:
long_description = f.read()
@@ -28,6 +28,6 @@
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
install_requires=dependencies,
setup_requires=['pytest-runner'],
    tests_require=['pytest', 'python-dotenv'],
entry_points={'console_scripts': ['twarc = twarc:main']}
)
56 changes: 48 additions & 8 deletions test_twarc.py
@@ -2,8 +2,12 @@
import re
import json
import time
import dotenv
import pytest
import logging
import datetime

dotenv.load_dotenv()

try:
from unittest.mock import patch, call, MagicMock # Python 3
@@ -633,10 +637,6 @@ def test_truncated_text():

def test_invalid_credentials():
old_consumer_key = T.consumer_key

T.consumer_key = 'Definitely not a valid key'
with pytest.raises(RuntimeError):
@@ -655,18 +655,58 @@ def test_app_auth():
assert count == 10


@pytest.mark.xfail
def test_labs_v1_sample():
ta = twarc.Twarc(app_auth=True)

collected = 0

for tweet in ta.labs_v1_sample():
if 'data' in tweet:
collected += 1

if collected == 100:
break

# reconnect to close streaming connection for other tests
ta.connect()

def test_premium_30day_search():
twitter_env = os.environ['TWITTER_ENV']
t = twarc.Twarc(app_auth=True)
now = datetime.date.today()
then = (now - datetime.timedelta(days=14))

search = t.premium_search(
q='blacklivesmatter',
product='30day',
environment=twitter_env,
to_date=then,
sandbox=True
)
tweet = next(search)
assert tweet

def test_premium_fullarchive_search():
twitter_env = os.environ['TWITTER_ENV']
from_date = datetime.date(2013, 7, 1)
to_date = datetime.date(2013, 8, 1)
t = twarc.Twarc(app_auth=True)
search = t.premium_search(
q='blacklivesmatter',
product='fullarchive',
environment=twitter_env,
from_date=from_date,
to_date=to_date,
sandbox=True
)

count = 0
for tweet in search:
created_at = datetime.datetime.strptime(
tweet['created_at'],
'%a %b %d %H:%M:%S +0000 %Y'
)
assert created_at.date() >= from_date
assert created_at.date() <= to_date
count += 1

assert count > 200

2 changes: 1 addition & 1 deletion twarc/__init__.py
@@ -1,4 +1,4 @@
__version__ = '1.9.0' # also in setup.py

from .client import Twarc
from .command import main
88 changes: 79 additions & 9 deletions twarc/client.py
@@ -6,6 +6,7 @@
import json
import types
import logging
import datetime
import requests

import ssl
@@ -141,6 +142,71 @@ def search(self, q, max_id=None, since_id=None, lang=None,

max_id = str(int(status["id_str"]) - 1)

def premium_search(self, q, product, environment, from_date=None,
to_date=None, max_results=None, sandbox=False, limit=0):
"""
Search using the Premium Search API. You will need to pass in a query
a product (30day or fullarchive) and and environment to use. Optionally
you can pass in a from_date and to_date to limit the search using
datetime objects. If you would like to set max_results you can, or
you can accept the maximum results (500). If using the a sandbox
environment you will want to set sandbox=True to lower the max_results
to 100. The limit option will cause your search to finish after it has
return more than that number of tweets (0 means no limit).
"""

if not self.app_auth:
raise RuntimeError(
"This endpoint is only available with application authentication. "
"Pass app_auth=True in Python or --app-auth on the command line."
)

if from_date and not isinstance(from_date, datetime.date):
raise RuntimeError("from_date must be a datetime.date or datetime.datetime object")
if to_date and not isinstance(to_date, datetime.date):
raise RuntimeError("to_date must be a datetime.date or datetime.datetime object")

if product not in ['30day', 'fullarchive']:
raise RuntimeError(
'Invalid Premium Search API product: {}'.format(product)
)

        # set default max_results based on whether it's sandboxed
if max_results is None:
if sandbox:
max_results = 100
else:
max_results = 500

url = 'https://api.twitter.com/1.1/tweets/search/{}/{}.json'.format(
product,
environment
)

params = {
"query": q,
"fromDate": from_date.strftime('%Y%m%d%H%M') if from_date else None,
"toDate": to_date.strftime('%Y%m%d%H%M') if to_date else None,
"maxResults": max_results
}

count = 0
stop = False
while not stop:
resp = self.get(url, params=params)
if resp.status_code == 200:
data = resp.json()
for tweet in data['results']:
count += 1
yield tweet
if limit != 0 and count >= limit:
stop = True
break
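                # premium results are paged: a "next" token in the
                # response means there are more tweets to fetch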
if 'next' in data:
params['next'] = data['next']
else:
stop = True

def timeline(self, user_id=None, screen_name=None, max_id=None,
since_id=None, max_pages=None):
"""
@@ -488,7 +554,7 @@ def labs_v1_sample(self, event=None, record_keepalive=False):
while True:
try:
log.info("connecting to labs sample stream")
                resp = self.get(url, headers=headers, stream=True)
errors = 0
for raw_line in resp.iter_lines(chunk_size=512):
line = raw_line.decode()
@@ -512,6 +578,9 @@
if self.http_errors and errors == self.http_errors:
log.warning("too many errors")
raise e
                if e.response.status_code == 403:
                    log.warning("access denied for app (403 Error)")
                    raise e
if e.response.status_code == 420:
if interruptible_sleep(errors * 60, event):
log.info("stopping filter")
@@ -758,16 +827,17 @@ def get(self, *args, **kwargs):
if not self.client:
self.connect()

if "params" in kwargs:
# set default tweet_mode
if "params" not in kwargs:
kwargs["params"] = {"tweet_mode": self.tweet_mode}
else:
kwargs["params"]["tweet_mode"] = self.tweet_mode
elif "labs_v1_call" in kwargs:
# This is a v1 labs call, so we want to use format=detailed
# to capture as much as possible

# override tweet_mode for labs and premium endpoints
if re.search(r"api.twitter.com/labs", args[0]):
kwargs["params"] = {"format": "detailed"}
# Don't pass this downstream to the requests call.
del kwargs["labs_v1_call"]
else:
kwargs["params"] = {"tweet_mode": self.tweet_mode}
elif re.search(r"api.twitter.com/1.1/tweets/search/", args[0]):
kwargs["params"].pop("tweet_mode")

        # Pass allow_404 to not retry on 404
allow_404 = kwargs.pop('allow_404', False)