Commit 719a426e authored by Alberto Pascual
Browse files

language detector in analysis, SPARQL filter by language

parent bd081244
Pipeline #448 passed with stages
in 34 seconds
......@@ -175,6 +175,11 @@ class FusekiTask(luigi.Task):
for i, line in enumerate(infile):
self.set_status_message("Lines read: %d" % i)
w = json.loads(line)
#print(w)
try:
w["schema:headline"] = {"@value": w["schema:headline"],"@language": w["language_detected"]}
except:
w["schema:articleBody"] = {"@value": w["schema:articleBody"],"@language": w["language_detected"]}
f.append(w)
f = json.dumps(f)
self.set_status_message("JSON created")
......
......@@ -57,7 +57,7 @@ def semanticAnalysis(i):
for x, index in enumerate(r["entries"][0]["entities"]):
index["nif:beginIndex"] = str(int(index["nif:beginIndex"]) + (REQUEST_LONG*k))
index["nif:endIndex"] = str(int(index["nif:endIndex"]) + (REQUEST_LONG*k))
if index["@type"].split('#')[-1] == 'ODENTITY_City':
if index["@type"] == 'ODENTITY_City':
try:
geor = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fcoordinates+where+%7B%0D%0A%0D%0Adbr%3A{}+georss%3Apoint+%3Fcoordinates%0D%0A%0D%0A%7D&format=application%2Fsparql-results%2Bjson".format(index.get("rdfs:subClassOf", "").split('/')[-1]))
coords = geor.json()['results']['bindings'][0]['coordinates']['value'].split()
......@@ -65,7 +65,7 @@ def semanticAnalysis(i):
index['longitude'] = coords[1]
except (IndexError, json.decoder.JSONDecodeError):
pass
if index["@type"].split('#')[-1] in ['ODENTITY_Person', 'ODENTITY_FullName']:
if index["@type"] in ['ODENTITY_Person', 'ODENTITY_FullName']:
try:
peopler = requests.get("https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fimage+where+%7B%0D%0A++dbr%3A{}+++dbo%3Athumbnail+%3Fimage%0D%0A%0D%0A%7D+LIMIT+100&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+".format(index.get("rdfs:subClassOf", "").split('/')[-1]))
index['dbo:thumbnail'] = peopler.json()['results']['bindings'][0]['image']['value']
......@@ -95,4 +95,8 @@ def semanticAnalysis(i):
i["sentiments"] = sentiments_arr
i["entities"] = entities_arr
i["topics"] = topics_arr
try:
i["language_detected"] = r["entries"][0]["language_detected"]
except:
i["language_detected"] = "en"
return i
No preview for this file type
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment