Commit 5ea8c11b authored by J. Fernando Sánchez's avatar J. Fernando Sánchez

Merge branch 'participation'

parents 67a22f62 c0467e5b
FROM python:3.6
FROM python:3.8
RUN groupadd -g 999 crawler && \
useradd -r -u 999 -g crawler crawler
@@ -14,8 +14,7 @@ RUN mkdir -p /usr/src/app
ADD requirements.txt /usr/src/app/
RUN pip install -r /usr/src/app/requirements.txt
RUN pip install --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
RUN pip install git+https://github.com/JustAnotherArchivist/snscrape.git
ADD . /usr/src/app
ENV LC_ALL C
@@ -17,7 +17,7 @@ services:
networks:
- gsicrawler
elasticsearch:
image: "docker.elastic.co/elasticsearch/elasticsearch:5.5.2"
image: "docker.elastic.co/elasticsearch/elasticsearch:7.12.0"
ulimits:
memlock:
soft: -1
TWITTER_ACCESS_TOKEN=<YOUR VALUE>
TWITTER_ACCESS_TOKEN_SECRET=<YOUR VALUE>
TWITTER_CONSUMER_KEY=<YOUR VALUE>
TWITTER_CONSUMER_SECRET=<YOUR VALUE>
FACEBOOK_USER=<YOUR VALUE>
FACEBOOK_PASSWORD=<YOUR VALUE>
NEWS_API_KEY=<YOUR VALUE>
NY_TIMES_API_KEY=<YOUR VALUE>
DISCORD_TOKEN=<YOUR VALUE>
\ No newline at end of file
@@ -60,3 +60,11 @@ def nyt_scraper(**kwargs):
@scraper('AlJazeera')
def aljazeera_scraper(**kwargs):
    return current_app.tasks.aljazeera_scraper


@scraper('Discord')
def discord_scraper(**kwargs):
    return current_app.tasks.discord_scraper


@scraper('Reddit')
def reddit_scraper(**kwargs):
    return current_app.tasks.reddit_scraper
\ No newline at end of file
import discord
import os
import datetime
import asyncio
import json
async def getMessages(guild_id, before=None, after=None, number=100):
    # Create Discord client using token
    client = discord.Client()
    await client.login(os.environ["DISCORD_TOKEN"], bot=False)
    # Get guild by its ID
    guild = await client.fetch_guild(guild_id)
    # Get channels in guild
    channels = await guild.fetch_channels()
    # Filter only text channels
    txt_channels = list(filter(lambda channel: channel.type == discord.ChannelType.text, channels))
    after_dt = datetime.datetime.strptime(after, '%Y-%m-%d') if after is not None else None
    before_dt = datetime.datetime.strptime(before, '%Y-%m-%d') if before is not None else None
    # Get messages from every text channel
    msgs = []
    for channel in txt_channels:
        try:
            msgs += await channel.history(limit=number, after=after_dt, before=before_dt).flatten()
        except Exception:
            # Skip channels whose history cannot be read (e.g. missing permissions)
            pass
    # Semantically annotate messages
    mapped_msgs = list(map(lambda msg: {
        "@type": ["schema:BlogPosting", ],
        "@id": msg.id,
        "schema:articleBody": msg.content,
        "schema:headline": msg.content,
        "schema:creator": msg.author.name,
        "schema:datePublished": msg.created_at.strftime('%Y-%m-%dT%H:%M:%SZ'),
        # "schema:inLanguage": "en",
        "source": "Discord",
        "community": guild.name
    }, msgs))
    await client.close()
    return mapped_msgs


def retrieveDiscordMessages(guild_id, before=None, after=None, number=100):
    # Make calls to the Discord API synchronous
    loop = asyncio.get_event_loop()
    coroutine = getMessages(guild_id, before, after, number)
    messages = loop.run_until_complete(coroutine)
    print(json.dumps(messages, indent=3))
    return messages
\ No newline at end of file
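A minimal usage sketch for retrieveDiscordMessages above, assuming a valid DISCORD_TOKEN is set in the environment; the guild ID and dates are placeholders:

if __name__ == '__main__':
    # Placeholder guild ID; replace with a guild the configured account can access
    messages = retrieveDiscordMessages(123456789012345678, after='2021-01-01', before='2021-01-31')
    print('Retrieved {} messages'.format(len(messages)))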
#!/usr/bin/env python
import requests
import time
from facebook_scraper import get_posts
import os
import json
user = os.environ["FACEBOOK_USER"]
password = os.environ["FACEBOOK_PASSWORD"]
# The access_token is generated by creating an empty Facebook App, which provides
# the app_id and app_secret needed.
# app_id = os.environ['FACEBOOK_APP_ID']
# app_secret = os.environ['FACEBOOK_APP_SECRET']
# Concatenating them ensures the token won't expire.
# access_token = app_id + "|" + app_secret
access_token = os.environ.get('FACEBOOK_ACCESS_TOKEN')
def getFBPageFeedData (page_id, num_status=10):
# We store the ids we are going to analyse
# Helper function used to catch possible errors and retry after 5 seconds
def request_until_succeed(url, max_tries=5):
#print (url)
success = False
response = None
for i in range(max_tries):
try:
response = requests.get(url)
#print(response.json())
if response.status_code == 200:
break
# facebook_scraper sometimes raises a spurious PermissionError, but it usually works after a few retries
while(True):
try:
print("Getting posts")
posts = get_posts(page_id, credentials=(user, password))
except PermissionError:
print("Login error")
continue
except:
print ("Retrying. There was an error for URL %s: %s" % (url, response.text))
time.sleep(5)
return response
# We reduce stories to 1 so they can be processed easily
# This will be called once per page to be analysed
# FIELDS:
# - message : text of the news item
# - link : URL link to the news item itself
# - created_time : publication date of the news item
# - type : content type (photo, video...)
# - name : name of the post (?)
# - id : id of the post
# - reactions.type(LIKE).summary(total_count).limit(0).as(like) : extracts the number of likes (total_count)
# - comments.limit(1).summary(true) : extracts the number of comments and the latest one (+ user + content + info)
# - shares&limit= : extracts the number of times the news item has been shared
def getFBPageFeedData (page_id, num_status):
page_idbak = page_id
# Concatenating them ensures the token won't expire.
base = "https://graph.facebook.com/v4.0"
node = "/" + page_id + "/feed"
parameters = "/?fields=created_time,story,message,name,id,reactions.type(LIKE).summary(total_count).limit(0).as(like),comments.limit(20).summary(true)&limit=%s&access_token=%s" % (num_status, access_token)
url = base + node + parameters
response = request_until_succeed(url)
data = response.json()
code = response.status_code
if code != 200:
raise Exception('Could not fetch data (error {}): {}'.format(code, response.text))
print ("Analisis de %s realizado!" %page_id)
#print(data['data'])
results = []
for post in data['data']:
#print(post)
aux = dict()
if 'message' in post:
aux["@type"] = "schema:BlogPosting"
aux["@id"] = 'https://www.facebook.com/'+page_idbak+'/posts/'+post["id"].split('_')[1]
aux["schema:datePublished"] = post["created_time"]
aux["schema:articleBody"] = post["message"]
aux["schema:author"] = 'facebook'
aux["schema:creator"] = page_idbak
aux["schema:search"] = page_idbak
for i, comment in enumerate(post['comments']['data']):
print(comment)
try:
url = base + '/{userid}/picture?redirect=false&height=100&access_token={token}'.format(userid=comment['id'],token=access_token)
profile = request_until_succeed(url)
print(profile)
post['comments']['data'][i]['from']['photo'] = profile['data']['url']
except:
print('No profile picture')
aux['comments'] = post['comments']
aux['likes'] = post['like']['summary']['total_count']
results.append(aux)
return results
if __name__ == '__main__':
getFBPageFeedData ('restauranteslateral', 10)
break
else:
break
# Mapping
mapped_posts = []
for post in posts:
mapped_posts.append({
"@type": ["schema:BlogPosting", ],
"@id": post["post_id"],
"schema:articleBody": post["text"],
"schema:headline": post["text"],
"schema:creator": post["username"],
"schema:datePublished": post["time"].strftime('%Y-%m-%dT%H:%M:%SZ'),
"source": "Facebook",
"community": page_id
})
print(json.dumps(mapped_posts, indent=3))
return mapped_posts
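For reference, a hedged sketch of the facebook_scraper call pattern the new code relies on; the page name is a placeholder, and credentials come from the same environment variables used above:

import os
from facebook_scraper import get_posts

creds = (os.environ["FACEBOOK_USER"], os.environ["FACEBOOK_PASSWORD"])
for post in get_posts('somepublicpage', credentials=creds):
    # Each post dict exposes post_id, text, time and username, which the
    # mapping above turns into schema.org annotations
    print(post['post_id'], post['time'], post['text'][:80])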
import os
import sys
import datetime
from psaw import PushshiftAPI
import json
def retrieveRedditPosts(subreddit, endpoint="comments", after=None, before=None, number=100):
    print('Downloading from r/{}, after={} and before={}'.format(subreddit, after, before))
    # Pushshift expects epoch timestamps
    after = int(datetime.datetime.strptime(after, '%Y-%m-%d').timestamp()) if after is not None else None
    before = int(datetime.datetime.strptime(before, '%Y-%m-%d').timestamp()) if before is not None else None
    api = PushshiftAPI()
    if endpoint == 'submissions':
        gen = api.search_submissions(subreddit=subreddit, after=after, before=before, limit=number)
    elif endpoint == 'comments':
        gen = api.search_comments(subreddit=subreddit, after=after, before=before, limit=number)
    else:
        raise ValueError("endpoint {} is not submissions or comments".format(endpoint))
    # Semantically annotate results (note: 'body' is only present for comments)
    mapped_json = list(map(lambda data: {
        "@type": ["schema:BlogPosting", ],
        "@id": data.d_["id"],
        "schema:articleBody": data.d_["body"],
        "schema:about": subreddit,
        "schema:search": subreddit,
        "schema:author": "Reddit",
        "schema:headline": data.d_["body"],
        "schema:creator": data.d_["author"],
        "schema:datePublished": datetime.datetime.fromtimestamp(data.d_["created_utc"]).strftime('%Y-%m-%dT%H:%M:%SZ'),
        # "schema:inLanguage": "en",
        "source": "Reddit",
        "community": subreddit
    }, gen))
    return mapped_json
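A minimal usage sketch for retrieveRedditPosts, assuming the Pushshift API is reachable; the subreddit and dates are placeholders:

posts = retrieveRedditPosts('python', endpoint='comments', after='2021-01-01', before='2021-01-31', number=10)
for post in posts:
    print(post['schema:datePublished'], post['schema:creator'])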
import tweepy
import twint
import json
import os
import argparse
import time
import logging
from snscrape.modules import twitter
CONSUMER_KEY = os.environ['TWITTER_CONSUMER_KEY']
CONSUMER_SECRET = os.environ['TWITTER_CONSUMER_SECRET']
@@ -32,67 +32,42 @@ def timeline(api, query, count):
return api.user_timeline(screen_name=query, count=count)
def retrieveTweets(querytype, query, count=200, keep=False, library="twint", before=None, after=None):
if library == "twint":
def retrieveTweets(querytype, query, count=0, keep=False, library="tweepy", before=None, after=None):
if library == "snscrape":
full_query = '{} since:{} until:{} '.format(query, after, before)
scraper = twitter.TwitterSearchScraper(full_query)
tweet_list = []
consumer_key = CONSUMER_KEY
consumer_secret = CONSUMER_SECRET
access_token = ACCESS_TOKEN
access_token_secret = TOKEN_SECRET
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
t = twint.Config()
t.Search = query
t.Store_object = True
t.Limit = int(count)
t.Near = "europe"
if before:
t.Until = before
if after:
t.Since = after
try:
twint.run.Search(t)
except:
pass
results = []
for tweet in twint.output.tweets_list:
mytweet = tweet if keep else {}
try:
mytweet["schema:locationCreated"] = api.get_user(tweet.user_id_str, tweet.username).location
except:
pass
for tweet in scraper.get_items():
mytweet = {}
mytweet["@type"] = ["schema:BlogPosting", ]
mytweet["@id"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet.username, id=tweet.id)
mytweet["@id"] = tweet.id
mytweet["schema:about"] = query
mytweet["schema:search"] = query
mytweet["schema:articleBody"] = tweet.tweet
mytweet["schema:headline"] = tweet.tweet
mytweet["schema:creator"] = tweet.username
mytweet["schema:articleBody"] = tweet.content
mytweet["schema:headline"] = tweet.content
mytweet["schema:creator"] = tweet.user.username
mytweet["schema:author"] = 'twitter'
mytweet["schema:inLanguage"] = tweet.lang
mytweet["schema:keywords"] = tweet.hashtags
mytweet["schema:datePublished"] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.strptime(tweet.datetime,'%Y-%m-%d %H:%M:%S %Z'))
mytweet["schema:datePublished"] = tweet.date.strftime('%Y-%m-%dT%H:%M:%SZ')
if tweet.reply_to:
mytweet["@type"].append("schema:Comment")
mytweet["schema:parentItem"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet.reply_to[0]["screen_name"], id=tweet.reply_to[0]["id"])
results.append(mytweet)
if tweet.place:
mytweet["schema:locationCreated"] = tweet.place.fullName
print(json.dumps(results, indent=3))
return results
if tweet.coordinates:
mytweet['location'] = {
'lat': tweet.coordinates.latitude,
'lon': tweet.coordinates.longitude
}
elif library == "tweepy":
mytweet["year"] = tweet.date.strftime('%Y')
tweet_list.append(mytweet)
return tweet_list
else:
consumer_key = CONSUMER_KEY
consumer_secret = CONSUMER_SECRET
@@ -144,6 +119,3 @@ def retrieveTweets(querytype, query, count=200, keep=False, library="twint", bef
results.append(mytweet)
return results
else:
return [{}]
\ No newline at end of file
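A hedged sketch of the snscrape call pattern the rewritten retrieveTweets now uses; the query and dates below are placeholders:

from snscrape.modules import twitter

scraper = twitter.TwitterSearchScraper('openaccess since:2021-01-01 until:2021-01-31')
for i, tweet in enumerate(scraper.get_items()):
    # The same attributes the scraper maps above: content, user.username and date
    print(tweet.date, tweet.user.username, tweet.content[:80])
    if i >= 9:  # stop after ten tweets for the example
        break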
@@ -24,6 +24,8 @@ from gsicrawler.scrapers.elmundo import retrieveElMundoNews
from gsicrawler.scrapers.elpais import retrieveElPaisNews
from gsicrawler.scrapers.tripadvisor import retrieveTripadvisorReviews
from gsicrawler.scrapers.aljazeera import retrieveAlJazeeraNews
from gsicrawler.scrapers.discord import retrieveDiscordMessages
from gsicrawler.scrapers.reddit import retrieveRedditPosts
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
@@ -110,3 +112,13 @@ def nyt_scraper(query, date):
@crawler
def aljazeera_scraper(query, date):
    return retrieveAlJazeeraNews(query, date)


@celery.task
@crawler
def discord_scraper(guild_id, before, after, number):
    return retrieveDiscordMessages(guild_id, before, after, number)


@celery.task
@crawler
def reddit_scraper(subreddit, endpoint, after, before, number):
    return retrieveRedditPosts(subreddit, endpoint, after, before, number)
\ No newline at end of file
@@ -131,8 +131,8 @@ paths:
type: string
enum:
- tweepy
- twint
default: twint
- snscrape
default: tweepy
in: query
required: true
description: "Select the library used to retrieve the tweets. Tweepy uses the official Twitter API with the its limitations. Twint uses a different method and doesn' require API keys."
@@ -389,3 +389,93 @@ paths:
$ref: "#/components/responses/done"
202:
$ref: "#/components/responses/pending"
/scrapers/discord/:
get:
operationId: gsicrawler.controllers.scrapers.discord_scraper
description: Gets all messages from all text channels in the selected Discord guild.
tags:
- scrapers
parameters:
- name: guild_id
schema:
type: integer
in: query
description: ID of the guild whose messages will be retrieved.
required: true
- name: before
schema:
type: string
default: ""
in: query
description: "Search for messages up until this date. Date format is '2021-01-31'."
- name: after
schema:
type: string
default: ""
in: query
description: "Search for messages since this date. Date format is '2021-01-31'."
- name: number
schema:
type: integer
nullable: true
default: 100
in: query
description: Number of results to retrieve. If omitted, all available messages are retrieved.
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/doctype'
- $ref: '#/components/parameters/timeout'
responses:
200:
$ref: "#/components/responses/done"
202:
$ref: "#/components/responses/pending"
/scrapers/reddit/:
get:
operationId: gsicrawler.controllers.scrapers.reddit_scraper
description: Gets all comments/submissions from a subreddit.
tags:
- scrapers
parameters:
- name: subreddit
schema:
type: string
in: query
description: Name of the subreddit to search in.
required: true
- name: endpoint
schema:
type: string
default: comments
in: query
description: Choose between comments and submissions.
- name: before
schema:
type: string
default: ""
in: query
description: "Search for posts up until this date. Date format is '2021-01-31'."
- name: after
schema:
type: string
default: ""
in: query
description: "Search for posts since this date. Date format is '2021-01-31'."
- name: number
schema:
type: integer
nullable: true
default: 100
in: query
description: Number of results to retrieve. If omitted, all available posts are retrieved.
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/doctype'
- $ref: '#/components/parameters/timeout'
responses:
200:
$ref: "#/components/responses/done"
202:
$ref: "#/components/responses/pending"