Commit 79e94aac authored by J. Fernando Sánchez

Fix twitter and load into ES from the beginning

* Limit number of results for every scraper
* Scrapers can now be generators. Entries will be stored into ES as they are yielded
parent 8eaa10d9
@@ -8,11 +8,13 @@ from functools import wraps
 def scraper(source):
     def outer(func):
         @wraps(func)
-        def inner(esendpoint=None, doctype=source, index='gsicrawler', timeout=1, **kwargs):
+        def inner(esendpoint=None, index='gsicrawler', timeout=1, **kwargs):
+            if timeout < 0:
+                timeout = None
             response = {'parameters': kwargs, 'source': source}
             try:
-                task = func().delay(esendpoint=esendpoint, doctype=doctype, index=index, **kwargs)
+                task = func().delay(esendpoint=esendpoint, index=index, **kwargs)
                 response['task_id'] = task.id
                 results = task.get(timeout=timeout)
                 response['results'] = results
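
The `timeout` handling above leans on Celery's `AsyncResult.get`, which blocks until the task finishes when `timeout=None`. A minimal sketch of the intended semantics (the helper name and fallback behaviour are illustrative, not the project's code):

    from celery.exceptions import TimeoutError

    def wait_for(task, timeout=1):
        # A negative timeout is mapped to None, and get(timeout=None) blocks
        # until the task completes, i.e. the request becomes synchronous.
        if timeout is not None and timeout < 0:
            timeout = None
        try:
            return task.get(timeout=timeout)
        except TimeoutError:
            # Task still running: callers can poll /tasks/{taskId} instead.
            return None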
@@ -63,8 +65,8 @@ def aljazeera_scraper(**kwargs):
 @scraper('Discord')
 def discord_scraper(**kwargs):
-    return current_app.tasks.dsicord_scraper
+    return current_app.tasks.discord_scraper
 
 
 @scraper('Reddit')
 def reddit_scraper(**kwargs):
-    return current_app.tasks.reddit_scraper
\ No newline at end of file
+    return current_app.tasks.reddit_scraper
@@ -4,7 +4,7 @@ import datetime
 import asyncio
 import json
 
 
-async def getMessages(guild_id, before=None, after=None, number=100):
+async def getMessages(guild_id, before=None, after=None):
     # Create Discord client using token
     client = discord.Client()
@@ -26,7 +26,7 @@ async def getMessages(guild_id, before=None, after=None, number=100):
     msgs = []
     for channel in txt_channels:
         try:
-            msgs += await channel.history(limit=number, after=after_dt, before=before_dt).flatten()
+            msgs += await channel.history(after=after_dt, before=before_dt).flatten()
         except:
            pass
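
One hedged caveat: in discord.py, `history()` defaults to `limit=100` when the argument is omitted, so dropping `limit=number` does not by itself fetch a channel's full history. An explicit "no limit" call would look like this (assuming discord.py's documented signature):

    # limit=None asks discord.py for the entire channel history; the result
    # cap is now applied downstream by the crawler wrapper instead.
    msgs += await channel.history(limit=None, after=after_dt, before=before_dt).flatten()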
@@ -57,4 +57,4 @@ def retrieveDiscordMessages(guild_id, before=None, after=None):
     print(json.dumps(messages, indent=3))
-    return messages
\ No newline at end of file
+    return messages
@@ -9,7 +9,7 @@ import time
 from newspaper import Article
 from bs4 import BeautifulSoup
 
 
-def retrieveElPaisNews(search, num):
+def retrieveElPaisNews(search):
     s = requests.Session()
@@ -52,4 +52,4 @@ def retrieveElPaisNews(search, num):
             results.append(newsitem)
         except:
             pass
-    return results
\ No newline at end of file
+    return results
@@ -32,19 +32,16 @@ def timeline(api, query, count):
     return api.user_timeline(screen_name=query, count=count)
 
 
-def retrieveTweets(querytype, query, count=0, keep=False, library="tweepy", before=None, after=None):
+def retrieveTweets(querytype, query, count=0, keep=False, library="snscrape", before=None, after=None):
     if library == "snscrape":
-        full_query = '{}'
+        full_query = '{}'.format(query)
         if before:
-            full_query = full_query + ' since:{}'.format(before)
+            full_query = full_query + ' until:{}'.format(before)
         if after:
-            full_query = full_query + ' after:{}'.format(after)
+            full_query = full_query + ' since:{}'.format(after)
         scraper = twitter.TwitterSearchScraper(full_query)
-        tweet_list = []
        for tweet in scraper.get_items():
             mytweet = {}
             mytweet["@type"] = ["schema:BlogPosting", ]
             mytweet["@id"] = tweet.id
@@ -68,8 +65,7 @@ def retrieveTweets(querytype, query, count=0, keep=False, library="tweepy", before=None, after=None):
             }
             mytweet["year"] = tweet.date.strftime('%Y')
-            tweet_list.append(mytweet)
-        return tweet_list
+            yield mytweet
     else:
@@ -95,7 +91,6 @@ def retrieveTweets(querytype, query, count=0, keep=False, library="tweepy", before=None, after=None):
             raise Exception('Unknown query type')
 
-        results = []
         for item in found:
            jsontweet = json.dumps(item._json)
            tweet = json.loads(jsontweet)
@@ -120,6 +115,4 @@ def retrieveTweets(querytype, query, count=0, keep=False, library="tweepy", before=None, after=None):
                 mytweet["@type"].append("schema:Comment")
                 mytweet["schema:parentItem"] = 'https://twitter.com/{screen_name}/status/{id}'.format(screen_name=tweet["in_reply_to_screen_name"], id=tweet["in_reply_to_status_id"])
-            results.append(mytweet)
-        return results
+            yield mytweet
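
With both branches yielding, `retrieveTweets` is now a plain generator: nothing is fetched until a consumer iterates it, which is what lets entries reach ES as they arrive. A usage sketch (the `querytype` value is an assumed example):

    tweets = retrieveTweets(querytype='search', query='python')
    first = next(tweets)  # tweets are produced lazily, one at a time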
@@ -3,6 +3,7 @@ from celery.decorators import periodic_task
 from celery.schedules import crontab
 from functools import wraps
+from itertools import islice
 import json
 import os
@@ -49,44 +50,46 @@ def test_task():
 def crawler(func):
     @wraps(func)
-    def func_wrapper(output, esendpoint, index, doctype, **kwargs):
+    def func_wrapper(output, esendpoint, index, number, **kwargs):
         print(kwargs)
-        filepath = "/tmp/"+str(time.time())+".json"
-        print(filepath)
-        print("Scraping...")
-        result = func(**kwargs)
-        print(len(result))
+        print("Starting scraper...")
+        if number < 0:
+            number = None
+        result = islice(func(**kwargs), number)
         if (output == "elasticsearch"):
             es = Elasticsearch(hosts=[esendpoint])
             for doc in result:
                 id = doc['@id']
                 print('Storing {}'.format(id))
-                res = es.index(index=index, doc_type=doctype, id=id, body=doc)
-                if (res['result']!='created'):
-                    print(res['result'])
+                res = es.index(index=index, id=id, body=doc)
             return "Check your results at: "+esendpoint+"/"+index+"/_search"
         else:
+            result = list(result)
+            print(len(result))
             return result
     return func_wrapper
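
Two details carry the new `crawler` wrapper above: `islice` keeps the scraper lazy while turning `number = None` into "no limit", and dropping `doc_type` from `es.index` matches the typeless index API of Elasticsearch 7+. A minimal sketch of the limiting behaviour:

    from itertools import islice

    def limited(items, number):
        # number < 0 becomes None above, and islice(it, None) yields everything.
        if number is not None and number < 0:
            number = None
        return islice(items, number)

    list(limited(iter(range(5)), 3))   # -> [0, 1, 2]
    list(limited(iter(range(5)), -1))  # -> [0, 1, 2, 3, 4]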
 @celery.task
 @crawler
-def twitter_scraper(query, querytype, number, keep, library, before, after):
-    return retrieveTweets(querytype, query, number, keep, library, before, after)
+def twitter_scraper(query, querytype, keep, library, before, after):
+    return retrieveTweets(querytype=querytype, query=query, keep=keep, library=library, before=before, after=after)
 
 
 @celery.task
 @crawler
-def tripadvisor_scraper(query, number):
-    return retrieveTripadvisorReviews(query, number)
+def tripadvisor_scraper(query):
+    return retrieveTripadvisorReviews(query)
 
 
 @celery.task
 @crawler
-def facebook_scraper(query, number):
-    return getFBPageFeedData(query, number)
+def facebook_scraper(query):
+    return getFBPageFeedData(query)
 
 
 @celery.task
 @crawler
@@ -95,13 +98,13 @@ def cnn_scraper(query,date):
 @celery.task
 @crawler
-def elpais_scraper(query, number):
-    return retrieveElPaisNews(query, number)
+def elpais_scraper(query):
+    return retrieveElPaisNews(query)
 
 
 @celery.task
 @crawler
-def elmundo_scraper(query, number):
-    return retrieveElMundoNews(query, number)
+def elmundo_scraper(query):
+    return retrieveElMundoNews(query)
 
 
 @celery.task
 @crawler
@@ -115,10 +118,10 @@ def aljazeera_scraper(query, date):
 @celery.task
 @crawler
-def dsicord_scraper(guild_id, before, after, number):
+def discord_scraper(guild_id, before, after):
     return retrieveDiscordMessages(guild_id, before, after)
 
 
 @celery.task
 @crawler
-def reddit_scraper(subreddit, endpoint, after, before, number):
-    return retrieveRedditPosts(subreddit, endpoint, after, before, number)
\ No newline at end of file
+def reddit_scraper(subreddit, endpoint, after, before):
+    return retrieveRedditPosts(subreddit, endpoint, after, before)
@@ -64,21 +64,21 @@ components:
       required: false
       schema:
         type: string
-    doctype:
-      name: doctype
+    number:
+      name: number
       in: query
-      description: elasticsearch doc_type to store data.
-      required: false
+      description: "Number of results to retrieve. Set to 0 or below to retrieve all available results."
       schema:
-        type: string
+        type: integer
+        default: 10
     timeout:
      name: timeout
       in: query
-      description: seconds to wait before returning the task information.
+      description: seconds to wait before returning the task information. Set to a negative number to wait synchronously.
       required: false
       schema:
         type: integer
-        default: 1
+        default: -1
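
With the shared `number` and `timeout` parameters in place, a fully synchronous, size-limited call looks roughly like this (path and parameter names come from this spec; host, port, and the query value are assumptions):

    import requests

    # timeout=-1 waits for the task to finish; number=20 caps the results.
    r = requests.get('http://localhost:5000/scrapers/elmundo/',
                     params={'query': 'madrid', 'number': 20,
                             'timeout': -1, 'output': 'json'})
    print(r.json())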
 paths:
   /tasks/{taskId}:
     get:
@@ -132,7 +132,7 @@
             enum:
               - tweepy
               - snscrape
-            default: tweepy
+            default: snscrape
           in: query
           required: true
           description: "Select the library used to retrieve the tweets. Tweepy uses the official Twitter API with its limitations. Twint uses a different method and doesn't require API keys."
@@ -148,17 +148,11 @@ paths:
             default: ""
           in: query
           description: "Search for tweets since this date. Only works for the Twint library. Date format is '2021-01-31'."
-        - name: number
-          schema:
-            type: integer
-            default: 10
-          in: query
-          description: "Number of reviews to retrieve. Set to 0 or below to retrieve all available reviews."
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -177,17 +171,11 @@ paths:
           in: query
           description: Restaurants to search about in tripadvisor
           required: true
-        - name: number
-          schema:
-            type: integer
-            default: -1
-          in: query
-          description: Number of results wanted
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -206,17 +194,11 @@ paths:
           in: query
           description: facebook page name to crawl
           required: true
-        - name: number
-          schema:
-            type: integer
-            default: 10
-          in: query
-          description: Number of results wanted
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -244,62 +226,12 @@ paths:
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
       responses:
         200:
           $ref: "#/components/responses/done"
         202:
           $ref: "#/components/responses/pending"
-  # /scrapers/elpais/:
-  #   get:
-  #     operationId: gsicrawler.controllers.scrapers.elpais_scraper
-  #     description: Run a scraper to search news in ElPais
-  #     deprecated: true
-  #     tags:
-  #       - scrapers
-  #     produces:
-  #       - application/json
-  #     parameters:
-  #       - name: query
-  #         type: string
-  #         in: query
-  #         description: Topic to search news in ElPais
-  #         required: true
-  #       - name: number
-  #         type: integer
-  #         default: 10
-  #         in: query
-  #         description: Number of results wanted
-  #         required: true
-  #       - name: output
-  #         type: string
-  #         enum:
-  #           - json
-  #           - elasticsearch
-  #         in: query
-  #         description: Select the output. If takes long taskId will be returned
-  #         required: true
-  #       - name: esendpoint
-  #         type: string
-  #         in: query
-  #         description: elasticsearch endpoint to store data. host:port
-  #         required: false
-  #       - name: index
-  #         type: string
-  #         in: query
-  #         description: elasticsearch index to store data.
-  #         required: false
-  #       - name: doctype
-  #         type: string
-  #         in: query
-  #         description: elasticsearch doc_type to store data.
-  #         required: false
-  #     responses:
-  #       200:
-  #         description: Scraper result or task id running in background
   /scrapers/elmundo/:
     get:
       operationId: gsicrawler.controllers.scrapers.elmundo_scraper
@@ -313,17 +245,12 @@
           in: query
           description: Topic to search news in ElMundo
           required: true
-        - name: number
-          schema:
-            type: integer
-            default: 10
-          in: query
-          description: Number of results wanted
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -352,8 +279,8 @@ paths:
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -382,7 +309,6 @@ paths:
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
       responses:
         200:
@@ -414,18 +340,11 @@ paths:
             default: ""
           in: query
           description: "Search for messages since this date. Date format is '2021-01-31'."
-        - name: number
-          schema:
-            type: integer
-            nullable: true
-            default: 100
-          in: query
-          description: Number of results wanted. If None, all are retrieved.
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"
@@ -462,18 +381,11 @@ paths:
             default: ""
           in: query
           description: "Search for posts since this date. Date format is '2021-01-31'."
-        - name: number
-          schema:
-            type: integer
-            nullable: true
-            default: 100
-          in: query
-          description: Number of results wanted. If None, all are retrieved.
         - $ref: '#/components/parameters/output'
         - $ref: '#/components/parameters/esendpoint'
         - $ref: '#/components/parameters/index'
-        - $ref: '#/components/parameters/doctype'
         - $ref: '#/components/parameters/timeout'
+        - $ref: '#/components/parameters/number'
       responses:
         200:
           $ref: "#/components/responses/done"