Commit 9d8117eb authored by Alberto Pascual's avatar Alberto Pascual

docs updates, somedi use-case added

parent 54667754
......@@ -26,12 +26,26 @@ services:
ports:
- "3030:3030"
volumes:
- fusekidata:/fuseki
- ./fuseki:/fuseki
environment:
- ADMIN_PASSWORD=fusekisoneti
networks:
- soneti
sefarad:
build: ./somedi-usecase/sefarad/
ports:
- "8080:8080"
environment:
- ES_ENDPOINT_EXTERNAL=http://localhost:9200
- FUSEKI_ENDPOINT_EXTERNAL=localhost:3030
volumes:
- ./somedi-usecase/sefarad/:/usr/src/app
depends_on:
- gsicrawler
networks:
- soneti
senpy:
image: gsiupm/senpy
command: --default-plugins
......@@ -56,8 +70,8 @@ services:
volumes:
- esdata:/usr/share/elasticsearch/data/
ports:
- 19200:9200
- 19300:9300
- 9200:9200
- 9300:9300
networks:
- soneti
......@@ -72,5 +86,4 @@ networks:
volumes:
esdata:
fusekidata:
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
## Fuseki Server configuration file.
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
[] rdf:type fuseki:Server ;
# Example::
# Server-wide query timeout.
#
# Timeout - server-wide default: milliseconds.
# Format 1: "1000" -- 1 second timeout
# Format 2: "10000,60000" -- 10s timeout to first result,
# then 60s timeout for the rest of query.
#
# See javadoc for ARQ.queryTimeout for details.
# This can also be set on a per dataset basis in the dataset assembler.
#
# ja:context [ ja:cxtName "arq:queryTimeout" ; ja:cxtValue "30000" ] ;
# Add any custom classes you want to load.
# Must have a "public static void init()" method.
# ja:loadClass "your.code.Class" ;
# End triples.
.
@prefix : <http://base/#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
:service_tdb_all a fuseki:Service ;
rdfs:label "TDB default" ;
fuseki:dataset :tdb_dataset_readwrite ;
fuseki:name "default" ;
fuseki:serviceQuery "query" , "sparql" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:serviceReadWriteGraphStore
"data" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" .
:tdb_dataset_readwrite
a tdb:DatasetTDB ;
tdb:location "/fuseki/databases/default" .
1
\ No newline at end of file
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[main]
# Development
ssl.enabled = false
plainMatcher=org.apache.shiro.authc.credential.SimpleCredentialsMatcher
#iniRealm=org.apache.shiro.realm.text.IniRealm
iniRealm.credentialsMatcher = $plainMatcher
#localhost=org.apache.jena.fuseki.authz.LocalhostFilter
[users]
# Implicitly adds "iniRealm = org.apache.shiro.realm.text.IniRealm"
admin=fusekisoneti
[roles]
[urls]
## Control functions open to anyone
/$/status = anon
/$/ping = anon
## and the rest are restricted
/$/** = authcBasic,user[admin]
## If you want simple, basic authentication user/password
## on the operations,
## 1 - set a password in [users]
## 2 - change the line above to:
## /$/** = authcBasic,user[admin]
## and set a better
## or to allow any access.
##/$/** = anon
# Everything else
/**=anon
{
"tdb.file_mode" : "direct" ,
"tdb.block_size" : 1024 ,
"tdb.block_read_cache_size" : 50 ,
"tdb.block_write_cache_size" : 20 ,
"tdb.node2nodeid_cache_size" : 500 ,
"tdb.nodeid2node_cache_size" : 500 ,
"tdb.node_miss_cache_size" : 100 ,
"tdb.index_node2id" : "node2id" ,
"tdb.index_id2node" : "nodes" ,
"tdb.triple_index_primary" : "SPO" ,
"tdb.triple_indexes" : [
"SPO" ,
"POS" ,
"OSP"
] ,
"tdb.quad_index_primary" : "GSPO" ,
"tdb.quad_indexes" : [
"GSPO" ,
"GPOS" ,
"GOSP" ,
"POSG" ,
"OSPG" ,
"SPOG"
] ,
"tdb.prefix_index_primary" : "GPU" ,
"tdb.prefix_indexes" : [ "GPU" ] ,
"tdb.file_prefix_index" : "prefixIdx" ,
"tdb.file_prefix_nodeid" : "prefix2id" ,
"tdb.file_prefix_id2node" : "prefixes"
}
1
\ No newline at end of file
@prefix : <http://base/#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
:service_tdb_all a fuseki:Service ;
rdfs:label "TDB default" ;
fuseki:dataset :tdb_dataset_readwrite ;
fuseki:name "default" ;
fuseki:serviceQuery "query" , "sparql" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:serviceReadWriteGraphStore
"data" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" .
:tdb_dataset_readwrite
a tdb:DatasetTDB ;
tdb:location "/fuseki/databases/default" .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable in-memory dataset.
<#service1> rdf:type fuseki:Service ;
# URI of the dataset -- http://host:port/{NAME}
fuseki:name "{NAME}" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceQuery "query" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#dataset> ;
.
# Transactional, in-memory dataset. Initially empty.
<#dataset> rdf:type ja:DatasetTxnMem .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Read-only in-memory dataset - used as a default, dummy dataset
<#service1> rdf:type fuseki:Service ;
fuseki:name "" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceQuery "query" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#dataset> ;
.
## In-memory, empty.
<#dataset> rdf:type ja:RDFDataset .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "{FUSEKI_BASE}/databases/{NAME}" ;
##ja:context [ ja:cxtName "arq:queryTimeout" ; ja:cxtValue "3000" ] ;
##tdb:unionDefaultGraph true ;
.
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "{DIR}" ;
##tdb:unionDefaultGraph true ;
.
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable in-memory TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "--mem--" ;
## tdb:unionDefaultGraph true ;
.
orchestrator @ bff06cc1
Subproject commit 51cbed2230fb278e54d646264578ca2874597f74
Subproject commit bff06cc179745c434e63d06f7a7650a2aa4b6fa1
......@@ -6,6 +6,13 @@
Welcome to Soneti's documentation!
==================================
**Soneti** is a toolkit for **analyzing social media**, such as social networks (e.g. Twitter, Facebook, ...), blogs, YouTube, Newspapers, AppStores, etc..
It obtains data from different sources, in addition it enriches this obtained data by performing different types of automatic analysis. Finally, it allows us to visualize the data obtained in interactive dashboards.
.. figure:: figures/soneti.png
:alt: Soneti overview
.. toctree::
:maxdepth: 2
:caption: Contents:
......
============
Installation
============
===========
Quick start
===========
Soneti's installation is quite easy and in a few steps you can have the toolkit installed and ready to use.
Soneti's services installation is quite easy, and in a few steps you can have the demo services working.
First of all it is necessary to clone the GitLab repository:
.. sourcecode:: bash
......@@ -17,11 +17,12 @@ Now images are ready to run:
$ docker-compose up --build
Check Senpy service is working on http://localhost:8000, GSICrawler service is working on http://localhost:5000 and Sefarad is working on http://localhost:8080
This installation offers a basic version of each service:
It is also necessary to initialize a Fuseki dataset in order to be able to store some data. Browse to http://localhost:3030/manage.html?tab=new-dataset. The required user and password are admin and fusekisoneti, respectively.
* **GSICrawler:** This ingestion service demo has CNN, New York Times, ElMundo, Facebook and Twitter as possible sources. This service is available on http://localhost:5000
Name the dataset "default" and select the Persistent option. Click on the create dataset button.
* **Senpy**: This analysis service demo has sentiment140 as sentiment analysis plugin and EmoRand as emotion analysis plugin. This service is available on http://localhost:8000/
Now your Soneti toolkit is ready to work.
* **Sefarad**: This visualization demo environment provides a dashboard for Somedi project and is available on http://localhost:8080.
* **Orchestrator**: Luigi provides a web interface to check your workflows status on http://localhost:8082
......@@ -2,22 +2,21 @@
Use cases
=========
In this documentation we are going to show some uses of Soneti toolkit using the orchestrator.
In this documentation we are going to show some uses of Soneti toolkit.
First of all, you need to create a python script named `orchestrator.py` on root folder and add our dependencies:
SOMEDI: Social Media and Digital Interaction Intelligence
---------------------------------------------------------
.. sourcecode:: python
This use case is part of the SOMEDI project. In this use case we are going to track Restaurantes Lateral brand on social media.
import luigi
from luigi.contrib.esindex import CopyToIndex
from orchestrator.SenpyAnalysis import SenpyAnalysis
from orchestrator.GSICrawlerScraper import GSICrawlerScraper
from orchestrator.CopyToFuseki import CopyToFuseki
We are going to describe this use case in different incremental phases.
Use case 1: Use GSICrawler to get some news
-------------------------------------------
I. Use GSICrawler to get tweets and Facebook posts from official accounts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This use case is going to retrieve the latest news on CNN newspaper about Trump, for doing so is necessary to add some lines to our python script created before:
This phase gets tweets and Facebook posts from official accounts and prints the results.
Below is the detailed part of the task located on `somedi-usecase/workflow.py`.
.. sourcecode:: python
......@@ -31,19 +30,27 @@ This use case is going to retrieve the latest news on CNN newspaper about Trump,
def output(self):
return luigi.LocalTarget(path='/tmp/_scrapy-%s.json' % self.id)
As shown in the code we select as endpoint our GSICrawler service and other parameters are going to be given by command line.
As shown in the code we select as endpoint our GSICrawler demo service and other parameters are going to be given by command line.
Run the orchestrator:
Run the orchestrator's workflow to retrieve the 10 latest tweets:
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module orchestrator ScrapyTask --query Trump --number 10 --source cnn --id 1
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow ScrapyTask --query rest_lateral --number 10 --source twitter --id 1
Now run the orchestrator's workflow to retrieve the 10 latest Facebook posts; the query must be the official account name on Facebook, without the @:
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow ScrapyTask --query restauranteslateral --number 10 --source facebook --id 2
Use case 2: Use GSICrawler to get some tweets, and analyse sentiments with Senpy
--------------------------------------------------------------------------------
This use case improve the use case 1 adding analysis with Senpy. In addition, we change the data source from CNN newspaper to Twitter. Modify the python script adding this lines:
II. Analyse collected tweets and Facebook posts with Senpy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This phase improves the previous one by adding analysis with Senpy.
Below is the detailed part of the task located on `somedi-usecase/workflow.py`.
.. sourcecode:: python
......@@ -53,12 +60,9 @@ This use case improve the use case 1 adding analysis with Senpy. In addition, we
id = luigi.Parameter()
number =luigi.Parameter()
source = luigi.Parameter()
host = 'http://senpy:5000/api/'
algorithm = 'sentiment140'
lang = 'en'
algorithm = luigi.Parameter()
lang = luigi.Parameter()
def requires(self):
return ScrapyTask(self.id,self.query,self.number,self.source)
......@@ -68,16 +72,24 @@ This use case improve the use case 1 adding analysis with Senpy. In addition, we
As shown in the code we select as endpoint our Senpy service and other parameters are going to be given by command line.
Run again the orchestrator:
You must select which Senpy algorithm and language are going to be used in the analysis.
Run the orchestrator's workflow again, using the sentiment140 plugin in Spanish:
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow AnalysisTask --query restauranteslateral --number 10 --source facebook --algorithm sentiment140 --lang es --id 3
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module orchestrator AnalysisTask --query Trump --number 10 --source twitter --id 2
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow AnalysisTask --query rest_lateral --number 10 --source twitter --algorithm sentiment140 --lang es --id 4
Use case 3: Use GSICrawler to get some tweets, analyse sentiments with Senpy and store results on Fuseki and Elasticsearch
--------------------------------------------------------------------------------------------------------------------------
III. Store collected and analysed tweets on Fuseki and Elasticsearch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This use case improve use case 2 adding a persistence layer to store results. Modify the python script adding this lines:
This phase improves the previous one by adding a persistence layer to store results.
Below is the detailed part of the task located on `somedi-usecase/workflow.py`.
.. sourcecode:: python
......@@ -87,6 +99,8 @@ This use case improve use case 2 adding a persistence layer to store results. Mo
query = luigi.Parameter()
number = luigi.Parameter()
source = luigi.Parameter()
algorithm = luigi.Parameter()
lang = luigi.Parameter()
host = 'fuseki'
port = 3030
......@@ -102,8 +116,10 @@ This use case improve use case 2 adding a persistence layer to store results. Mo
query = luigi.Parameter()
number = luigi.Parameter()
source = luigi.Parameter()
index = 'soneti'
doc_type = 'news'
algorithm = luigi.Parameter()
lang = luigi.Parameter()
index = 'somedi'
doc_type = 'lateral'
host = 'elasticsearch'
port = 9200
timeout = 100
......@@ -117,15 +133,40 @@ This use case improve use case 2 adding a persistence layer to store results. Mo
query = luigi.Parameter()
number = luigi.Parameter()
source = luigi.Parameter()
algorithm = luigi.Parameter()
lang = luigi.Parameter()
def requires(self):
yield FusekiTask(self.id, self.query, self.number)
yield Elasticsearch(self.id, self.query, self.number)
Run again the orchestrator:
Run the orchestrator's workflow again:
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module GSICrawler StoreTask --query Trump --number 10 --source cnn --id 3
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow StoreTask --query restauranteslateral --number 10 --source facebook --algorithm sentiment140 --lang es --id 5
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow StoreTask --query rest_lateral --number 10 --source twitter --algorithm sentiment140 --lang es --id 6
Now your data is available on Elasticsearch and Fuseki.
IV. Show stored data in a Sefarad dashboard
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~