Commit 038f6791 authored by J. Fernando Sánchez's avatar J. Fernando Sánchez
Browse files

Support multiple ES versions

parent 54f22e1f
......@@ -8,13 +8,13 @@ from functools import wraps
def scraper(source):
def outer(func):
@wraps(func)
def inner(esendpoint=None, index='gsicrawler', timeout=1, **kwargs):
def inner(esendpoint=None, esversion='auto', index='gsicrawler', timeout=1, **kwargs):
if timeout < 0:
timeout = None
response = {'parameters': kwargs, 'source': source}
try:
task = func().delay(esendpoint=esendpoint, index=index, **kwargs)
task = func().delay(esendpoint=esendpoint, esversion=esversion, index=index, **kwargs)
response['task_id'] = task.id
results = task.get(timeout=timeout)
response['results'] = results
......
......@@ -27,8 +27,11 @@ from gsicrawler.scrapers.tripadvisor import retrieveTripadvisorReviews
from gsicrawler.scrapers.aljazeera import retrieveAlJazeeraNews
from gsicrawler.scrapers.discord import retrieveDiscordMessages
from gsicrawler.scrapers.reddit import retrieveRedditPosts
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import elasticsearch
import elasticsearch6
import elasticsearch7
import elasticsearch8
logger = get_task_logger(__name__)
......@@ -50,7 +53,7 @@ def test_task():
def crawler(func):
@wraps(func)
def func_wrapper(output, esendpoint, index, number, **kwargs):
def func_wrapper(output, esendpoint, esversion, index, number, **kwargs):
print(kwargs)
filepath = "/tmp/"+str(time.time())+".json"
print(filepath)
......@@ -61,9 +64,20 @@ def crawler(func):
number = None
result = islice(func(**kwargs), number)
args = {'hosts': [esendpoint]}
if esversion == 'auto':
es = elasticsearch.Elasticsearch(**args)
elif esversion == '6':
es = elasticsearch6.Elasticsearch(doc_type="tweet")
elif esversion == '7':
es = elasticsearch7.Elasticsearch(**args)
elif esversion == '8':
es = elasticsearch8.Elasticsearch(**args)
else:
raise Exception("ES version not supported: {}".format(esversion))
if (output == "elasticsearch"):
es = Elasticsearch(hosts=[esendpoint])
for doc in result:
id = doc['@id']
print('Storing {}'.format(id))
......
......@@ -57,6 +57,18 @@ components:
required: false
schema:
type: string
esversion:
name: esversion
in: query
description: elasticsearch version to use. Set to auto to use the latest elasticsearch client.
required: false
schema:
type: string
enum:
- '7'
- '8'
- auto
default: auto
index:
name: index
in: query
......@@ -150,6 +162,7 @@ paths:
description: "Search for tweets since this date. Only works for the Twint library. Date format is '2021-01-31'."
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -173,6 +186,7 @@ paths:
required: true
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -196,6 +210,7 @@ paths:
required: true
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -225,6 +240,7 @@ paths:
required: true
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
responses:
......@@ -248,6 +264,7 @@ paths:
description: Number of results wanted
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -278,6 +295,7 @@ paths:
required: true
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -308,6 +326,7 @@ paths:
required: true
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
responses:
......@@ -342,6 +361,7 @@ paths:
description: "Search for messages since this date. Date format is '2021-01-31'."
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......@@ -383,6 +403,7 @@ paths:
description: "Search for posts since this date. Date format is '2021-01-31'."
- $ref: '#/components/parameters/output'
- $ref: '#/components/parameters/esendpoint'
- $ref: '#/components/parameters/esversion'
- $ref: '#/components/parameters/index'
- $ref: '#/components/parameters/timeout'
- $ref: '#/components/parameters/number'
......
......@@ -7,17 +7,7 @@ metadata:
ingress.kubernetes.io/rewrite-target: /
spec:
rules:
- host: crawler.social.cluster.gsi.dit.upm.es
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: gsicrawler-web
port:
number: 5000
- host: crawler.gsi.upm.es
- host: crawler-demos.gsi.upm.es
http:
paths:
- path: /
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment