Commit acbc7b7e authored by Alberto Pascual

Add Al Jazeera scraper and articles

parent 45fa31bb
@@ -15,7 +15,11 @@ import subprocess
from scrapers.cnnScraper import retrieveCnnNews
from scrapers.nytimesScraper import retrieveNytimesNews
from scrapers.twitter import retrieve_tweets
from scrapers.aljazeeraScraper import retrieveAlJazeeraNews
from analyzers.analysis import expertAnalysis
from analyzers.analysis import semanticAnalysis
from analyzers.analysis import myAnalysis
ES_ENDPOINT = os.environ.get('ES_ENDPOINT')
ES_PORT = os.environ.get('ES_PORT')
@@ -23,7 +27,7 @@ FUSEKI_PORT = os.environ.get('FUSEKI_PORT')
FUSEKI_ENDPOINT = os.environ.get('FUSEKI_ENDPOINT')
print('ES connection: {} : {}'.format(ES_ENDPOINT, ES_PORT))
class ScrapyTask(luigi.Task):
class AnalysisTask(luigi.Task):
"""
Generates a local file containing 5 elements of data in JSON format.
"""
@@ -42,6 +46,7 @@ class ScrapyTask(luigi.Task):
num = luigi.Parameter()
def run(self):
"""
Writes data in JSON format into the task's output target.
@@ -50,75 +55,107 @@ class ScrapyTask(luigi.Task):
* `text`: the text,
* `date`: the day when the data was created.
"""
#today = datetime.date.today()
print(self.analysisType)
"""
with open('dabiq_texts.txt') as infile:
with self.output().open('w') as output:
for _ in infile:
print(_)
i = expertAnalysis(_)
output.write(json.dumps(i))
output.write('\n')
lines = sum(1 for _ in infile)
#lines = len(infile.readlines())
print(infile)
print(lines)
"""
filePath = '/tmp/_scrapy-%s.json' % self.id
#scraperImported = imp.load_source(self.website, 'scrapers/%s.py' % (self.website))
#scraperImported.startScraping(self.url, filePath)
print(self.url, filePath)
retrieveCnnNews(self.url, self.num, filePath)
retrieveNytimesNews(self.url, self.num, filePath)
retrieve_tweets(self.url, filePath, self.num)
def output(self):
print("Analyzing")
print(filePath)
print(self.url)
print(self.num)
cnn = retrieveCnnNews(self.url, 5, filePath)
nyt = retrieveNytimesNews(self.url, 5, filePath)
alj = retrieveAlJazeeraNews(self.url, 5, filePath)
"""
Returns the target output for this task.
In this case, a successful execution of this task will create a file on the local filesystem.
:return: the target output for this task.
:rtype: object (:py:class:`luigi.target.Target`)
#print("aljazeera")
#print(alj)
"""
return luigi.LocalTarget(path='/tmp/_scrapy-%s.json' % self.id)
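The run() docstring above describes the record shape each task writes: one JSON object per line with `_id`, `text` and `date`. A minimal illustrative example of one such line, with made-up values (only the field names come from the docstring, everything else is an assumption):

import datetime
import json

# Illustrative only: one record in the JSON-lines shape the run() docstrings
# describe; the values are invented, only the field names come from the diff.
record = {
    "_id": "example-id-1",                      # default Elasticsearch id field
    "text": "Example article body",             # the scraped text
    "date": datetime.date.today().isoformat(),  # day the data was created
}

with open('/tmp/_scrapy-example.json', 'w') as output:
    output.write(json.dumps(record))
    output.write('\n')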
class AnalysisTask(luigi.Task):
"""
Generates a local file containing 5 elements of data in JSON format.
"""
#: the date parameter.
#date = luigi.DateParameter(default=datetime.date.today())
#field = str(random.randint(0,10000)) + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
url = luigi.Parameter()
id = luigi.Parameter()
dabiqarticles = json.load(open('dabiqarticles.json'))
rumiyaharticles = json.load(open('rumiyaharticles.json'))
tweets = retrieve_tweets("islamic state", filePath, 5)
with self.output().open('w') as output:
analysisType = luigi.Parameter()
for article in dabiqarticles:
#print(article)
if article["@type"] == "schema:Article":
print("inside")
try:
i = myAnalysis(article)
output.write(json.dumps(i))
output.write('\n')
except:
pass
for article in rumiyaharticles:
#print(article)
if article["@type"] == "schema:Article":
print("inside")
try:
i = myAnalysis(article)
output.write(json.dumps(i))
output.write('\n')
except:
pass
for newsitem in alj:
i = myAnalysis(newsitem)
output.write(json.dumps(i))
output.write('\n')
for newsitem in cnn:
i = myAnalysis(newsitem)
output.write(json.dumps(i))
output.write('\n')
for newsitem in nyt:
i = myAnalysis(newsitem)
output.write(json.dumps(i))
output.write('\n')
num = luigi.Parameter()
for tweet in tweets:
i = myAnalysis(tweet)
output.write(json.dumps(i))
output.write('\n')
def requires(self):
"""
This task's dependencies:
* :py:class:`~.SenpyTask`
:return: object (:py:class:`luigi.task.Task`)
"""
for tweet in tweets:
i = semanticAnalysis(tweet)
output.write(json.dumps(i))
output.write('\n')
"""
"""
return ScrapyTask(self.url, self.id, self.analysisType, self.num)
#retrieveNytimesNews(self.url, self.num, filePath)
#retrieve_tweets(self.url, filePath, self.num)
def run(self):
"""
Writes data in JSON format into the task's output target.
The data objects have the following attributes:
* `_id` is the default Elasticsearch id field,
* `text`: the text,
* `date`: the day when the data was created.
"""
articles = json.load(open('blank.json'))
with self.output().open('w') as output:
with self.input().open('r') as infile:
lines = infile.readlines()
for j,line in enumerate(lines):
i = json.loads(line)
progress = (j*100)/len(lines)
self.set_status_message("Progress %d%%" % progress)
i = semanticAnalysis(i)
for article in articles:
print(article)
if article["@type"][0] == "http://schema.org/Article":
print("inside")
i = semanticAnalysis(article)
output.write(json.dumps(i))
output.write('\n')
"""
def output(self):
"""
@@ -10,7 +10,7 @@ import time
API_KEY_MEANING_CLOUD = os.environ.get('API_KEY_MEANING_CLOUD')
def getContext():
r = requests.get("http://latest.senpy.cluster.gsi.dit.upm.es/api/contexts/Context.jsonld")
r = requests.get("http://senpy.cluster.gsi.dit.upm.es/api/contexts/Context.jsonld")
senpy_context = r.json()["@context"]
senpy_context.update({
'dbps':'http://www.openlinksw.com/schemas/dbpedia-spotlight#',
@@ -38,15 +38,16 @@ def semanticAnalysis(i):
REQUEST_LONG = 3000
i_len = len(i["schema:articleBody"])
number_of_requests = (len(i["schema:articleBody"])//REQUEST_LONG)
i['_id'] = i['@id']
entities_arr = []
sentiments_arr = []
topics_arr = []
for k in range(0,number_of_requests+1):
if i_len - int(REQUEST_LONG*(k+1)) > 0:
r = requests.post('http://meaningcloud.senpy.cluster.gsi.dit.upm.es/api/', data={'algo':'sentiment-meaningCloud', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["schema:articleBody"][REQUEST_LONG*k:REQUEST_LONG*k+REQUEST_LONG]})
r = requests.post('http://senpy:5000/api/', data={'algo':'sentiment140', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["http://schema.org/articleBody"][0]["@value"][REQUEST_LONG*k:REQUEST_LONG*k+REQUEST_LONG]})
else:
r = requests.post('http://meaningcloud.senpy.cluster.gsi.dit.upm.es/api/', data={'algo':'sentiment-meaningCloud', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["schema:articleBody"][REQUEST_LONG*k:-1]})
r = requests.post('http://senpy:5000/api/', data={'algo':'sentiment140', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["http://schema.org/articleBody"][0]["@value"][REQUEST_LONG*k:-1]})
time.sleep(1)
r = r.json()
@@ -59,7 +60,7 @@ def semanticAnalysis(i):
index["nif:endIndex"] = str(int(index["nif:endIndex"]) + (REQUEST_LONG*k))
if index["@type"].split('#')[-1] == 'ODENTITY_City':
try:
geor = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fcoordinates+where+%7B%0D%0A%0D%0Adbr%3A{}+georss%3Apoint+%3Fcoordinates%0D%0A%0D%0A%7D&format=application%2Fsparql-results%2Bjson".format(index.get("rdfs:subClassOf", "").split('/')[-1]))
geor = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fcoordinates+where+%7B%0D%0A%0D%0Adbr%3A{}+georss%3Apoint+%3Fcoordinates%0D%0A%0D%0A%7D&format=application%2Fsparql-results%2Bjson".format(index.get("marl:describesObject", "").split('/')[-1]))
coords = geor.json()['results']['bindings'][0]['coordinates']['value'].split()
index['latitude'] = coords[0]
index['longitude'] = coords[1]
@@ -67,7 +68,7 @@ def semanticAnalysis(i):
pass
if index["@type"].split('#')[-1] in ['ODENTITY_Person', 'ODENTITY_FullName']:
try:
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(index.get("rdfs:subClassOf", "").split('/')[-1]))
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(index.get("marl:describesObject", "").split('/')[-1]))
index['dbo:thumbnail'] = peopler.json()['results']['bindings'][0]['image']['value']
except (IndexError, json.decoder.JSONDecodeError):
pass
@@ -87,6 +88,7 @@ def semanticAnalysis(i):
r["entries"][0]["topics"] = [r["entries"][0]["topics"]]
for x, index in enumerate(r["entries"][0]["topics"]):
index["@id"] = i["@id"]+"#Topic{num}".format(num=x)
index["rdfs:subClassOf"] = "http://dbpedia.org/resource/Internet"
topics_arr.append(index)
@@ -95,3 +97,257 @@ def semanticAnalysis(i):
i["entities"] = entities_arr
i["topics"] = topics_arr
return i
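semanticAnalysis (and myAnalysis below) posts the article body to Senpy in REQUEST_LONG-character slices and then shifts the returned nif:beginIndex/nif:endIndex values back into whole-document offsets. A self-contained, hedged sketch of that slicing and offset arithmetic, with the HTTP call replaced by a stand-in annotate() so it runs offline:

REQUEST_LONG = 3000

def annotate(chunk):
    # Stand-in for the Senpy request: pretend the whole chunk is one annotation.
    return [{"nif:beginIndex": "0", "nif:endIndex": str(len(chunk))}]

def chunked_annotations(body):
    # Hedged sketch of the loop in semanticAnalysis()/myAnalysis(): analyse each
    # REQUEST_LONG-character window and shift the nif offsets by the window start.
    annotations = []
    for k in range(len(body) // REQUEST_LONG + 1):
        chunk = body[REQUEST_LONG * k : REQUEST_LONG * (k + 1)]
        if not chunk:
            continue
        for index in annotate(chunk):
            index["nif:beginIndex"] = str(int(index["nif:beginIndex"]) + REQUEST_LONG * k)
            index["nif:endIndex"] = str(int(index["nif:endIndex"]) + REQUEST_LONG * k)
            annotations.append(index)
    return annotations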
def expertAnalysis(entry):
text = entry
result = {}
result["text"] = text
data = {"DOCUMENT":text}
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
url_cat = "http://trivalent.expertsystemlab.com/text/rest/categorize"
res_cat = requests.post(url_cat, data=json.dumps(data), headers=headers)
#entry['categorize'] = json.loads(res_cat.text)
url_info = "http://trivalent.expertsystemlab.com/text/rest/extract-info"
res_info = json.loads(requests.post(url_info, data=json.dumps(data), headers=headers).text)
try:
organization_names = []
organizations = res_info["RESPONSE"]["ORGANIZATIONS"]
if isinstance(organizations["ORGANIZATION"],dict):
organization = organizations["ORGANIZATION"]["BASE"]
aux = {"@type": "schema:Organization",
"schema:name": organization}
organization_names.append(aux)
elif len(organizations["ORGANIZATION"]) > 1:
organizations_ = [x["BASE"] for x in organizations["ORGANIZATION"]]
organization_names = []
for organization in organizations_:
aux = {"@type": "schema:Organization",
"schema:name": organization}
organization_names.append(aux)
except:
print("organizations")
organization_names = []
try:
people_names = []
people = res_info["RESPONSE"]["PEOPLE"]
if isinstance(people["PERSON"],dict):
person = people["PERSON"]["BASE"]
aux = {"@type": "schema:Person",
"schema:name": person}
people_names.append(aux)
elif len(people["PERSON"]) > 1:
people_ = [x["BASE"] for x in people["PERSON"]]
for person in people_:
aux = {"@type": "schema:Person",
"schema:name": person}
people_names.append(aux)
except:
people_names = []
print("people")
try:
place_names = []
places = res_info["RESPONSE"]["PLACES"]
if isinstance(places["PLACE"], dict):
place = places["PLACE"]["BASE"]
aux = {"@type": "schema:Place",
"schema:name": place}
place_names.append(aux)
elif len(places["PLACE"]) > 1:
places_ = [x["BASE"] for x in places["PLACE"]]
for place in places_:
aux = {"@type": "schema:Place",
"schema:name": place}
place_names.append(aux)
except:
place_names = []
print("places")
result['organizations'] = organization_names
result['people'] = people_names
result['places'] = place_names
#entry['info'] = res_info
return result
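The three try blocks in expertAnalysis normalise the same quirk of the extract-info response: a single ORGANIZATION/PERSON/PLACE comes back as a dict, several come back as a list. A hedged refactoring sketch (not part of the commit) that covers all three cases with one helper:

def to_schema_entities(section, inner_key, schema_type):
    # section is e.g. res_info["RESPONSE"]["PEOPLE"], inner_key e.g. "PERSON".
    # A single match arrives as a dict, multiple matches as a list of dicts.
    entries = section.get(inner_key, [])
    if isinstance(entries, dict):
        entries = [entries]
    return [{"@type": schema_type, "schema:name": e["BASE"]} for e in entries]

# e.g. people_names = to_schema_entities(res_info["RESPONSE"]["PEOPLE"], "PERSON", "schema:Person")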
def myAnalysis(i):
i["@context"] = getContext()
REQUEST_LONG = 3000
i_len = len(i["schema:articleBody"])
number_of_requests = (len(i["schema:articleBody"])//REQUEST_LONG)
i['_id'] = i['@id']
key = "AIzaSyDxZkoTU0IDBZmw6q3-5P6VsZ7cfhiTvcY"
entities_arr = []
sentiments_arr = []
for k in range(0,number_of_requests+1):
if i_len - int(REQUEST_LONG*(k+1)) > 0:
r = requests.post('http://senpy:5000/api/', data={'algo':'sentiment140', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["schema:articleBody"][REQUEST_LONG*k:REQUEST_LONG*k+REQUEST_LONG]})
else:
r = requests.post('http://senpy:5000/api/', data={'algo':'sentiment140', 'apiKey':API_KEY_MEANING_CLOUD, 'i':i["schema:articleBody"][REQUEST_LONG*k:-1]})
time.sleep(1)
r = r.json()
if 'entries' not in r:
continue
if type(r["entries"][0]["sentiments"]) is dict:
r["entries"][0]["sentiments"] = [r["entries"][0]["sentiments"]]
for x, index in enumerate(r["entries"][0]["sentiments"]):
index["@id"] = i["@id"]+"#Sentiment{num}".format(num=x)
if 'nif:beginIndex' in index:
index["nif:beginIndex"] = str(int(index["nif:beginIndex"]) + (REQUEST_LONG*k))
if 'nif:endIndex' in index:
index["nif:endIndex"] = str(int(index["nif:endIndex"]) + (REQUEST_LONG*k))
sentiments_arr.append(index)
data = {"DOCUMENT":i["schema:articleBody"]}
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
url_cat = "http://trivalent.expertsystemlab.com/text/rest/categorize"
res_cat = requests.post(url_cat, data=json.dumps(data), headers=headers)
#entry['categorize'] = json.loads(res_cat.text)
url_info = "http://trivalent.expertsystemlab.com/text/rest/extract-info"
res_info = json.loads(requests.post(url_info, data=json.dumps(data), headers=headers).text)
try:
people_names = []
people = res_info["RESPONSE"]["PEOPLE"]
if isinstance(people["PERSON"],dict):
person = people["PERSON"]["BASE"]
dbpedia = requests.get('http://model.dbpedia-spotlight.org/en/annotate?text=%s&confidence=0.2&support=20' % person, headers={"Accept":"application/json"}).json()
resp_arr = dbpedia["Resources"]
final_person = resp_arr[0]['@URI']
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(final_person.split('/')[-1]))
thumbnail = peopler.json()['results']['bindings'][0]['image']['value']
aux = {"@type": "schema:Person",
"@id": final_person,
"schema:name": person,
"schema:image": thumbnail }
entities_arr.append(aux)
elif len(people["PERSON"]) > 1:
people_ = [x["BASE"] for x in people["PERSON"]]
for person in people_:
dbpedia = requests.get('http://model.dbpedia-spotlight.org/en/annotate?text=%s&confidence=0.2&support=20' % person, headers={"Accept":"application/json"}).json()
resp_arr = dbpedia["Resources"]
final_person = resp_arr[0]['@URI']
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(final_person.split('/')[-1]))
thumbnail = peopler.json()['results']['bindings'][0]['image']['value']
aux = {"@type": "schema:Person",
"@id": final_person,
"schema:name": person,
"schema:image": thumbnail }
entities_arr.append(aux)
except:
pass
try:
place_names = []
places = res_info["RESPONSE"]["PLACES"]
if isinstance(places["PLACE"], dict):
place = places["PLACE"]["BASE"]
dbpedia = requests.get('http://model.dbpedia-spotlight.org/en/annotate?text=%s&confidence=0.2&support=20' % place, headers={"Accept":"application/json"}).json()
resp_arr = dbpedia["Resources"]
final_place = resp_arr[0]['@URI']
place_query = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address=' + place + '&key=' + key)
resp_json_payload = place_query.json()
resp = resp_json_payload['results'][0]['geometry']['location']
lat = resp['lat']
lon = resp['lng']
aux = {"@type": "schema:Place",
"@id": final_place,
"schema:name": place,
"schema:geo": { "@type": "schema:GeoCoordinates", "schema:latitude": lat, "schema:longitude": lon}
}
entities_arr.append(aux)
elif len(places["PLACE"]) > 1:
places_ = [x["BASE"] for x in places["PLACE"]]
for place in places_:
dbpedia = requests.get('http://model.dbpedia-spotlight.org/en/annotate?text=%s&confidence=0.2&support=20' % place, headers={"Accept":"application/json"}).json()
resp_arr = dbpedia["Resources"]
final_place = resp_arr[0]['@URI']
place_query = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address=' + place + '&key=' + key)
resp_json_payload = place_query.json()
resp = resp_json_payload['results'][0]['geometry']['location']
lat = resp['lat']
lon = resp['lng']
aux = {"@type": "schema:Place",
"@id": final_place,
"schema:name": place,
"schema:geo": { "@type": "schema:GeoCoordinates", "schema:latitude": lat, "schema:longitude": lon}
}
entities_arr.append(aux)
except Exception as e:
print("fail")
print(e)
print("/fail")
try:
organization_names = []
organizations = res_info["RESPONSE"]["ORGANIZATIONS"]
if isinstance(organizations["ORGANIZATION"],dict):
organization = organizations["ORGANIZATION"]["BASE"]
aux = {"rdfs:subClassOf": "http://dbpedia.org/resource/" + organization,
"nif:anchorOf": organization}
entities_arr.append(aux)
elif len(organizations["ORGANIZATION"]) > 1:
organizations_ = [x["BASE"] for x in organizations["ORGANIZATION"]]
organization_names = []
for organization in organizations_:
aux = { "@type": "schema:Organization",
"@id": "http://dbpedia.org/resource/" + organization,
"schema:name": organization}
entities_arr.append(aux)
except:
print("organizations")
organization_names = []
"""
if type(r["entries"][0]["entities"]) is dict:
r["entries"][0]["entities"] = [r["entries"][0]["entities"]]
for x, index in enumerate(r["entries"][0]["entities"]):
index["nif:beginIndex"] = str(int(index["nif:beginIndex"]) + (REQUEST_LONG*k))
index["nif:endIndex"] = str(int(index["nif:endIndex"]) + (REQUEST_LONG*k))
if index["@type"].split('#')[-1] == 'ODENTITY_City':
try:
geor = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fcoordinates+where+%7B%0D%0A%0D%0Adbr%3A{}+georss%3Apoint+%3Fcoordinates%0D%0A%0D%0A%7D&format=application%2Fsparql-results%2Bjson".format(index.get("marl:describesObject", "").split('/')[-1]))
coords = geor.json()['results']['bindings'][0]['coordinates']['value'].split()
index['latitude'] = coords[0]
index['longitude'] = coords[1]
except (IndexError, json.decoder.JSONDecodeError):
pass
if index["@type"].split('#')[-1] in ['ODENTITY_Person', 'ODENTITY_FullName']:
try:
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(index.get("marl:describesObject", "").split('/')[-1]))
index['dbo:thumbnail'] = peopler.json()['results']['bindings'][0]['image']['value']
except (IndexError, json.decoder.JSONDecodeError):
pass
entities_arr.append(index)
"""
i["sentiments"] = sentiments_arr
i["entities"] = entities_arr
return i
\ No newline at end of file
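myAnalysis resolves each extracted person name through DBpedia Spotlight and then queries DBpedia for a thumbnail before emitting a schema:Person entity. A condensed, hedged sketch of that lookup chain, using the same endpoints as the code above but with error handling reduced to returning None:

import requests

# Hedged sketch of the person-enrichment chain in myAnalysis(): Spotlight
# annotate -> first resource URI -> DBpedia SPARQL thumbnail lookup.
SPARQL_THUMBNAIL = (
    "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org"
    "&query=select+%3Fimage+where+%7B+dbr%3A{}+dbo%3Athumbnail+%3Fimage+%7D+LIMIT+1"
    "&format=application%2Fsparql-results%2Bjson"
)

def enrich_person(name):
    try:
        spot = requests.get(
            "http://model.dbpedia-spotlight.org/en/annotate",
            params={"text": name, "confidence": 0.2, "support": 20},
            headers={"Accept": "application/json"},
        ).json()
        uri = spot["Resources"][0]["@URI"]
        bindings = requests.get(SPARQL_THUMBNAIL.format(uri.split("/")[-1])).json()
        image = bindings["results"]["bindings"][0]["image"]["value"]
        return {"@type": "schema:Person", "@id": uri,
                "schema:name": name, "schema:image": image}
    except (KeyError, IndexError, ValueError):
        return None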
@@ -14,14 +14,27 @@ services:
- "http.cors.enabled=true"
- 'http.cors.allow-origin=*'
volumes:
- esdata:/usr/share/elasticsearch/data/
- testelastic:/usr/share/elasticsearch/data/
ports:
- 19200:9200
- 19300:9300
networks:
- sefarad-network
#senpy:
#image: gsiupm/sentiment-meaningcloud:0.1.7-python3.5
#ports:
# - "5000:5000"
#networks:
# - sefarad-network
senpy:
image: gsiupm/senpy
command: --default-plugins
ports:
- "8000:5000"
networks:
- sefarad-network
fuseki:
image: stain/jena-fuseki
@@ -58,32 +71,10 @@ services:
depends_on:
- elasticsearch
- fuseki
web:
image: gsiupm/dashboard-gsicrawler:0.2.6
ports:
- "8080:8080"
environment:
- FUSEKI_ENDPOINT_EXTERNAL=${FUSEKI_ENDPOINT_EXTERNAL}
- ES_ENDPOINT_EXTERNAL=${ES_ENDPOINT_EXTERNAL}
depends_on:
- fuseki
- elasticsearch
dashboard:
build: ./demodashboard/
volumes:
- ./demodashboard:/usr/src/app
ports:
- "8090:8080"
environment:
- ES_ENDPOINT_EXTERNAL=${ES_ENDPOINT_EXTERNAL}
depends_on:
- fuseki
- elasticsearch
networks:
sefarad-network:
driver: bridge
volumes:
esdata:
testelastic:
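The compose change replaces the meaningCloud-specific Senpy image with gsiupm/senpy, reachable as http://senpy:5000/api/ inside the sefarad-network and on port 8000 from the host. A hedged smoke test that mirrors the POST the analyzers make, assuming the stack is up and the sentiment140 plugin is among the default plugins:

import requests

# Hedged smoke test: same parameters analysis.py sends to the senpy service,
# but aimed at the host-mapped port 8000 instead of the in-network senpy:5000.
resp = requests.post(
    "http://localhost:8000/api/",
    data={"algo": "sentiment140", "i": "I love this"},
)
print(resp.json().get("entries", []))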
import urllib.request
import re
from newspaper import Article
import html.parser as htmlparser
import requests
parser = htmlparser.HTMLParser()
def retrieveAlJazeeraNews(search, num, filepath):
#search = "isis"
#results = 10
url = "https://ajnsearch.aljazeera.com/SearchProxy.aspx?m=search&c=english&f=AJE_BS&s=as_q&q=" + search + "&p=0&r=" + str(num) + "&o=any&t=d&cnt=gsaSearch&target=gsaSearch"
def clean(text):
text = text.replace(u'\\u0026#39;','\'')
text = text.replace(u'\\','')
return(text)
content = urllib.request.urlopen(url).read()
tokens = str(content).split()
urls = []
headlines = []
flag = 0
aux_ = []
#Headline extraction
for word in tokens:
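retrieveAlJazeeraNews builds the SearchProxy URL by concatenating the raw search string, which breaks for multi-word queries such as "islamic state". A hedged alternative (not in the commit) that produces the same URL with the query properly encoded:

from urllib.parse import urlencode

def aljazeera_search_url(search, num):
    # Same parameters as the hard-coded URL above, but URL-encoded so spaces
    # and special characters in `search` are handled.
    params = {
        "m": "search", "c": "english", "f": "AJE_BS", "s": "as_q",
        "q": search, "p": 0, "r": num, "o": "any", "t": "d",
        "cnt": "gsaSearch", "target": "gsaSearch",
    }
    return "https://ajnsearch.aljazeera.com/SearchProxy.aspx?" + urlencode(params)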