Commit f79dbef9 authored by J. Fernando Sánchez

Merge branch 'twitter-scraper-twint'

parents 8f8cc478 23ce48d9
......@@ -8,7 +8,7 @@ build:
     name: gcr.io/kaniko-project/executor:debug
     entrypoint: [""]
   tags:
-    - docker
+    - minsky
   script:
     - echo "{\"auths\":{\"$CI_REGISTRY\":{\"username\":\"$CI_REGISTRY_USER\",\"password\":\"$CI_REGISTRY_PASSWORD\"},\"https://index.docker.io/v1/\":{ \"auth\":\"$HUB_AUTH\"}}}" > /kaniko/.docker/config.json
     # The skip-tls-verify flag is there because our registry certificate is self-signed
......
......
-FROM python:3.5
+FROM python:3.6
 RUN groupadd -g 999 crawler && \
     useradd -r -u 999 -g crawler crawler
......@@ -14,6 +14,7 @@ RUN mkdir -p /usr/src/app
 ADD requirements.txt /usr/src/app/
 RUN pip install -r /usr/src/app/requirements.txt
+RUN pip install --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
 ADD . /usr/src/app
......
 import tweepy
+import twint
 import json
 import os
 import argparse
 import time
+import logging
 CONSUMER_KEY = os.environ['TWITTER_CONSUMER_KEY']
 CONSUMER_SECRET = os.environ['TWITTER_CONSUMER_SECRET']
......@@ -30,54 +32,118 @@ def timeline(api, query, count):
     return api.user_timeline(screen_name=query, count=count)
-def retrieveTweets(querytype, query, count=200, keep=False):
-    consumer_key = CONSUMER_KEY
-    consumer_secret = CONSUMER_SECRET
-    access_token = ACCESS_TOKEN
-    access_token_secret = TOKEN_SECRET
-    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
-    auth.set_access_token(access_token, access_token_secret)
-    api = tweepy.API(auth)
-    found = []
-    if querytype == 'search':
-        found = search(api, query, count)
-    elif querytype == 'timeline':
-        found = timeline(api, query, count)
-    else:
-        raise Exception('Unknown query type')
-    results = []
-    for item in found:
-        jsontweet = json.dumps(item._json)
-        tweet = json.loads(jsontweet)
-        mytweet = tweet if keep else {}
-        mytweet["@type"] = ["schema:BlogPosting", ]
-        mytweet["photo"] = tweet['user']['profile_image_url']
-        mytweet["@id"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet['user']['screen_name'], id=tweet["id"])
-        mytweet["schema:about"] = query
-        mytweet["schema:search"] = query
-        mytweet["schema:articleBody"] = tweet["text"]
-        mytweet["schema:headline"] = tweet["text"]
-        mytweet["schema:creator"] = tweet['user']['screen_name']
-        mytweet["schema:author"] = 'twitter'
-        mytweet["source"] = 'twitter'
-        mytweet["schema:datePublished"] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
-        if tweet["in_reply_to_status_id"]:
-            mytweet["@type"].append("schema:Comment")
-            mytweet["schema:parentItem"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet["in_reply_to_screen_name"], id=tweet["in_reply_to_status_id"])
-        results.append(mytweet)
-    return results
+def retrieveTweets(querytype, query, count=200, keep=False, library="twint", before=None, after=None):
+    if library == "twint":
+        consumer_key = CONSUMER_KEY
+        consumer_secret = CONSUMER_SECRET
+        access_token = ACCESS_TOKEN
+        access_token_secret = TOKEN_SECRET
+        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
+        auth.set_access_token(access_token, access_token_secret)
+        api = tweepy.API(auth)
+        t = twint.Config()
+        t.Search = query
+        t.Store_object = True
+        t.Limit = int(count)
+        t.Near = "europe"
+        if before:
+            t.Until = before
+        if after:
+            t.Since = after
+        try:
+            twint.run.Search(t)
+        except:
+            pass
+        results = []
+        for tweet in twint.output.tweets_list:
+            mytweet = tweet if keep else {}
+            try:
+                mytweet["schema:locationCreated"] = api.get_user(tweet.user_id_str, tweet.username).location
+            except:
+                pass
+            mytweet["@type"] = ["schema:BlogPosting", ]
+            mytweet["@id"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet.username, id=tweet.id)
+            mytweet["schema:about"] = query
+            mytweet["schema:search"] = query
+            mytweet["schema:articleBody"] = tweet.tweet
+            mytweet["schema:headline"] = tweet.tweet
+            mytweet["schema:creator"] = tweet.username
+            mytweet["schema:author"] = 'twitter'
+            mytweet["schema:inLanguage"] = tweet.lang
+            mytweet["schema:keywords"] = tweet.hashtags
+            mytweet["schema:datePublished"] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.strptime(tweet.datetime, '%Y-%m-%d %H:%M:%S %Z'))
+            if tweet.reply_to:
+                mytweet["@type"].append("schema:Comment")
+                mytweet["schema:parentItem"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet.reply_to[0]["screen_name"], id=tweet.reply_to[0]["id"])
+            results.append(mytweet)
+        print(json.dumps(results, indent=3))
+        return results
+    elif library == "tweepy":
+        consumer_key = CONSUMER_KEY
+        consumer_secret = CONSUMER_SECRET
+        access_token = ACCESS_TOKEN
+        access_token_secret = TOKEN_SECRET
+        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
+        auth.set_access_token(access_token, access_token_secret)
+        api = tweepy.API(auth)
+        found = []
+        if querytype == 'search':
+            found = search(api, query, count)
+            logging.warning(found)
+        elif querytype == 'timeline':
+            found = timeline(api, query, count)
+        else:
+            raise Exception('Unknown query type')
+        results = []
+        for item in found:
+            jsontweet = json.dumps(item._json)
+            tweet = json.loads(jsontweet)
+            mytweet = tweet if keep else {}
+            mytweet["@type"] = ["schema:BlogPosting", ]
+            mytweet["photo"] = tweet['user']['profile_image_url']
+            mytweet["@id"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet['user']['screen_name'], id=tweet["id"])
+            mytweet["schema:about"] = query
+            mytweet["schema:search"] = query
+            mytweet["schema:articleBody"] = tweet["text"]
+            mytweet["schema:headline"] = tweet["text"]
+            mytweet["schema:creator"] = tweet['user']['screen_name']
+            mytweet["schema:author"] = 'twitter'
+            mytweet["source"] = 'twitter'
+            mytweet["schema:datePublished"] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
+            if tweet["in_reply_to_status_id"]:
+                mytweet["@type"].append("schema:Comment")
+                mytweet["schema:parentItem"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet["in_reply_to_screen_name"], id=tweet["in_reply_to_status_id"])
+            results.append(mytweet)
+        return results
+    else:
+        return [{}]
\ No newline at end of file
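Taken together, the merged function keeps the old tweepy flow behind `library="tweepy"` and adds a twint flow that maps `before`/`after` onto twint's `Until`/`Since` options (in the twint branch, `querytype` is ignored and a search is always run). A minimal usage sketch follows; the import path `twitter` is an assumption, and the `TWITTER_*` environment variables must be set before import since the module reads them at load time:

```python
# Usage sketch for the merged retrieveTweets; the module name "twitter" is
# hypothetical, and TWITTER_* credentials must exist in the environment.
from twitter import retrieveTweets

# Twint path (default library): before/after bound the search window.
tweets = retrieveTweets('search', '#opensource', count=50,
                        library='twint', after='2021-01-01', before='2021-01-31')

# Tweepy path: uses the official API, so querytype selects search vs. timeline.
tweets = retrieveTweets('timeline', 'some_screen_name', count=20, library='tweepy')

for tweet in tweets:
    print(tweet['@id'], tweet['schema:headline'])
```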
......@@ -73,9 +73,8 @@ def crawler(func):
 @celery.task
 @crawler
-def twitter_scraper(query, querytype, number, keep):
-    return retrieveTweets(querytype, query, number, keep)
+def twitter_scraper(query, querytype, number, keep, library, before, after):
+    return retrieveTweets(querytype, query, number, keep, library, before, after)
 @celery.task
 @crawler
......
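Since the task signature gained three arguments, producers must pass them through as well. A hedged sketch of enqueueing the updated task; the module name `tasks` and a configured Celery broker/worker are assumptions, not part of this diff:

```python
# Hypothetical producer-side call; assumes a configured Celery app and broker.
from tasks import twitter_scraper

# Argument order mirrors the new task signature:
# (query, querytype, number, keep, library, before, after)
async_result = twitter_scraper.delay('#ai', 'search', 100, False,
                                     'twint', '2021-01-31', '2021-01-01')
print(async_result.get(timeout=300))  # blocks until the crawl finishes
```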
......@@ -126,6 +126,28 @@ paths:
             default: false
           in: query
           description: "Keep all original fields in the response"
+        - name: library
+          schema:
+            type: string
+            enum:
+              - tweepy
+              - twint
+            default: twint
+          in: query
+          required: true
+          description: "Select the library used to retrieve the tweets. Tweepy uses the official Twitter API, with its limitations. Twint uses a different method and doesn't require API keys."
+        - name: before
+          schema:
+            type: string
+            default: ""
+          in: query
+          description: "Search for tweets up until this date. Only works with the Twint library. The date format is '2021-01-31'."
+        - name: after
+          schema:
+            type: string
+            default: ""
+          in: query
+          description: "Search for tweets since this date. Only works with the Twint library. The date format is '2021-01-31'."
         - name: number
           schema:
             type: integer
......
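For illustration, a client call exercising the new query parameters could look like the sketch below; the host and endpoint path are placeholders, since the enclosing `paths:` entry is truncated in this diff, and the `query`/`querytype` parameter names are inferred from the task signature:

```python
import requests

# Placeholder base URL and path; the real endpoint is defined elsewhere in the spec.
resp = requests.get('http://localhost:5000/scrapers/twitter', params={
    'query': '#opensource',
    'querytype': 'search',
    'number': 50,
    'keep': 'false',
    'library': 'twint',     # default; 'tweepy' uses the official API instead
    'after': '2021-01-01',  # date bounds only apply to the twint library
    'before': '2021-01-31',
})
resp.raise_for_status()
print(resp.json())
```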
......@@ -15,3 +15,4 @@ beautifulsoup4
 unidecode
 newsapi-python
 nytimesarticle
+tornado
\ No newline at end of file