# Enterprise Search Tweets - Data request with a Tweet type function to demonstrate how to classify Tweets
# Supports data requests only (not counts) and returns a parsed Tweet payload (select fields only)
import argparse
import json
import os
import sys

import requests
from dotenv import load_dotenv
load_dotenv(verbose=True)  # verbose=True logs a warning if no .env file is found
| 11 | + |
| 12 | +# Argparse for cli options. Run `python search_tweet_type.py -h` to see the list of arguments. |
| 13 | +parser = argparse.ArgumentParser() |
| 14 | +parser.add_argument("-r", "--request_file", help="Use json file for request body", |
| 15 | + action="store_true") |
| 16 | +parser.add_argument("-q", "--query", help="A valid query up to 2,048 characters") |
| 17 | +parser.add_argument("-f", "--from_date", help="Oldest date from which results will be provided") |
| 18 | +parser.add_argument("-t", "--to_date", help="Most recent date to which results will be provided") |
| 19 | +parser.add_argument("-m", "--max_results", help="Maximum number of results returned by a single\ |
| 20 | + request/response cycle (range: 10-500, default: 100)") |
| 21 | +parser.add_argument("-b", "--bucket", choices=['day', 'hour', 'minute'], |
| 22 | + help="The unit of time for which count data will be provided.") |
| 23 | +parser.add_argument("-n", "--next", help="Auto paginate through next tokens", action="store_true") |
| 24 | +args = parser.parse_args() |
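
# Example invocations (queries and values illustrative):
#   python search_tweet_type.py -q "from:TwitterDev" -m 100
#   python search_tweet_type.py -r -n  # use request.json and auto-paginate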

# Retrieves and stores credential information from the '.env' file
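# The .env file is expected to define the following variables, for example
# (values illustrative; the search archive is typically '30day' or 'fullarchive'):
#   USERNAME=my_username
#   PASSWORD=my_password
#   ACCOUNT_NAME=my_account
#   SEARCH_LABEL=my_label
#   SEARCH_ARCHIVE=fullarchive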
USERNAME = os.getenv("USERNAME")
PASSWORD = os.getenv("PASSWORD")
ACCOUNT_NAME = os.getenv("ACCOUNT_NAME")
ENDPOINT_LABEL = os.getenv("SEARCH_LABEL")
ARCHIVE = os.getenv("SEARCH_ARCHIVE")


def main():
    search_endpoint = f"https://gnip-api.twitter.com/search/{ARCHIVE}/accounts/{ACCOUNT_NAME}/{ENDPOINT_LABEL}.json"
    # Build request body from request.json if the -r flag is set, else from cli args
    request_body = build_request_body()
    # Make first request
    first_response = make_request(search_endpoint, request_body)
    # Deserialize json response
    json_response = json.loads(first_response.text)
    # Pull the list of Tweets from the response and set up a dict for the parsed output
    tweet_results = json_response["results"]
    parsed_results = { "parsed_results": [] }

    # Loop through Tweet results to test for type, extended Tweet, and parse JSON
    for tweet in tweet_results:
        extended_tweet = check_for_extended_tweet(tweet)
        tweet_type = determine_tweet_type(tweet)
        if extended_tweet:
            text = tweet["extended_tweet"]["full_text"]
        else:
            text = tweet["text"]
        custom_dict = {
            "tweet_id": tweet["id_str"],
            "text": text,
            "tweet_type": tweet_type,
            "hyperlink": "https://twitter.com/twitter/status/" + tweet["id_str"]
        }
        parsed_results["parsed_results"].append(custom_dict)  # Add Tweet to parsed_results list
    print(json.dumps(parsed_results, indent=2, sort_keys=True))
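
    # The printed output takes roughly this shape (values illustrative):
    # {
    #   "parsed_results": [
    #     {
    #       "hyperlink": "https://twitter.com/twitter/status/1234567890",
    #       "text": "Tweet text...",
    #       "tweet_id": "1234567890",
    #       "tweet_type": "Original Tweet"
    #     }
    #   ]
    # }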

    # Pagination logic (if the -n flag is passed, paginate through the results)
    if json_response.get("next") is None or not args.next:
        print("Request complete.")
    else:
        next_token = json_response.get("next")
        request_count = 1  # Keep track of the number of requests being made (pagination)
        while next_token is not None:
            # Update request_body with the next token
            request_body.update(next=next_token)
            # Make the request with the next token
            response = make_request(search_endpoint, request_body)
            # Deserialize this page's response and pull its results list and 'next' token
            n_response = json.loads(response.text)
            tweet_results = n_response["results"]
            parsed_results = { "parsed_results": [] }
            # Loop through Tweet results to test for type, extended Tweet, and parse JSON
            for tweet in tweet_results:
                extended_tweet = check_for_extended_tweet(tweet)
                tweet_type = determine_tweet_type(tweet)
                if extended_tweet:
                    text = tweet["extended_tweet"]["full_text"]
                else:
                    text = tweet["text"]
                custom_dict = {
                    "tweet_id": tweet["id_str"],
                    "text": text,
                    "tweet_type": tweet_type,
                    "hyperlink": "https://twitter.com/twitter/status/" + tweet["id_str"]
                }
                parsed_results["parsed_results"].append(custom_dict)  # Add Tweet to parsed_results
            print(json.dumps(parsed_results, indent=2, sort_keys=True))
            next_token = n_response.get("next")
            # Iterate the request counter
            request_count += 1
        print(f"Done paginating.\nTotal requests made: {request_count}")


def build_request_body():
    # Request file will override CLI options
    if args.request_file:
        with open("request.json", "r") as read_file:
            request_body = json.load(read_file)
    else:
        request_body = {}
        if args.query:
            request_body.update(query=args.query)
        if args.from_date:
            request_body.update(fromDate=args.from_date)
        if args.to_date:
            request_body.update(toDate=args.to_date)
        if args.max_results:
            request_body.update(maxResults=args.max_results)
        if args.bucket:
            request_body.update(bucket=args.bucket)

    return request_body
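
# For reference, a request.json consumed with the -r flag might look like this
# (field names mirror the CLI options above; dates use the API's YYYYMMDDHHMM
# format, values illustrative):
# {
#   "query": "from:TwitterDev lang:en",
#   "maxResults": 100,
#   "fromDate": "201811010000",
#   "toDate": "201811060000"
# }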


def make_request(endpoint, request_body):
    try:
        response = requests.post(url=endpoint, auth=(USERNAME, PASSWORD), json=request_body)
    except requests.exceptions.RequestException as e:
        print(e)
        sys.exit(120)

    return response


def determine_tweet_type(tweet):
    # Order matters: check for the reply indicator first
    if tweet["in_reply_to_status_id"] is not None:
        tweet_type = "Reply Tweet"
    # Check the boolean quote status field, but make sure it's not a Retweet (of a Quote Tweet)
    elif tweet["is_quote_status"] and not tweet["text"].startswith("RT"):
        tweet_type = "Quote Tweet"
    # Check both indicators of a Retweet
    elif tweet["text"].startswith("RT") and tweet.get("retweeted_status") is not None:
        tweet_type = "Retweet"
    else:
        tweet_type = "Original Tweet"

    return tweet_type


def check_for_extended_tweet(tweet):
    # Tweets longer than 140 characters carry their full text in an 'extended_tweet' object
    return "extended_tweet" in tweet


if __name__ == '__main__':
    main()