Commit 9d8117eb authored by Alberto Pascual's avatar Alberto Pascual

docs updates, somedi use-case added

parent 54667754
...@@ -26,12 +26,26 @@ services: ...@@ -26,12 +26,26 @@ services:
ports: ports:
- "3030:3030" - "3030:3030"
volumes: volumes:
- fusekidata:/fuseki - ./fuseki:/fuseki
environment: environment:
- ADMIN_PASSWORD=fusekisoneti - ADMIN_PASSWORD=fusekisoneti
networks: networks:
- soneti - soneti
sefarad:
build: ./somedi-usecase/sefarad/
ports:
- "8080:8080"
environment:
- ES_ENDPOINT_EXTERNAL=http://localhost:9200
- FUSEKI_ENDPOINT_EXTERNAL=localhost:3030
volumes:
- ./somedi-usecase/sefarad/:/usr/src/app
depends_on:
- gsicrawler
networks:
- soneti
senpy: senpy:
image: gsiupm/senpy image: gsiupm/senpy
command: --default-plugins command: --default-plugins
...@@ -56,8 +70,8 @@ services: ...@@ -56,8 +70,8 @@ services:
volumes: volumes:
- esdata:/usr/share/elasticsearch/data/ - esdata:/usr/share/elasticsearch/data/
ports: ports:
- 19200:9200 - 9200:9200
- 19300:9300 - 9300:9300
networks: networks:
- soneti - soneti
...@@ -72,5 +86,4 @@ networks: ...@@ -72,5 +86,4 @@ networks:
volumes: volumes:
esdata: esdata:
fusekidata:
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
## Fuseki Server configuration file.
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
[] rdf:type fuseki:Server ;
# Example::
# Server-wide query timeout.
#
# Timeout - server-wide default: milliseconds.
# Format 1: "1000" -- 1 second timeout
# Format 2: "10000,60000" -- 10s timeout to first result,
# then 60s timeout for the rest of query.
#
# See javadoc for ARQ.queryTimeout for details.
# This can also be set on a per dataset basis in the dataset assembler.
#
# ja:context [ ja:cxtName "arq:queryTimeout" ; ja:cxtValue "30000" ] ;
# Add any custom classes you want to load.
# Must have a "public static void init()" method.
# ja:loadClass "your.code.Class" ;
# End triples.
.
@prefix : <http://base/#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
:service_tdb_all a fuseki:Service ;
rdfs:label "TDB default" ;
fuseki:dataset :tdb_dataset_readwrite ;
fuseki:name "default" ;
fuseki:serviceQuery "query" , "sparql" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:serviceReadWriteGraphStore
"data" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" .
:tdb_dataset_readwrite
a tdb:DatasetTDB ;
tdb:location "/fuseki/databases/default" .
1
\ No newline at end of file
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[main]
# Development
ssl.enabled = false
plainMatcher=org.apache.shiro.authc.credential.SimpleCredentialsMatcher
#iniRealm=org.apache.shiro.realm.text.IniRealm
iniRealm.credentialsMatcher = $plainMatcher
#localhost=org.apache.jena.fuseki.authz.LocalhostFilter
[users]
# Implicitly adds "iniRealm = org.apache.shiro.realm.text.IniRealm"
admin=fusekisoneti
[roles]
[urls]
## Control functions open to anyone
/$/status = anon
/$/ping = anon
## and the rest are restricted
/$/** = authcBasic,user[admin]
## If you want simple, basic authentication user/password
## on the operations,
## 1 - set a password in [users]
## 2 - change the line above to:
## /$/** = authcBasic,user[admin]
## and set a better password,
## or to allow any access.
##/$/** = anon
# Everything else
/**=anon
{
"tdb.file_mode" : "direct" ,
"tdb.block_size" : 1024 ,
"tdb.block_read_cache_size" : 50 ,
"tdb.block_write_cache_size" : 20 ,
"tdb.node2nodeid_cache_size" : 500 ,
"tdb.nodeid2node_cache_size" : 500 ,
"tdb.node_miss_cache_size" : 100 ,
"tdb.index_node2id" : "node2id" ,
"tdb.index_id2node" : "nodes" ,
"tdb.triple_index_primary" : "SPO" ,
"tdb.triple_indexes" : [
"SPO" ,
"POS" ,
"OSP"
] ,
"tdb.quad_index_primary" : "GSPO" ,
"tdb.quad_indexes" : [
"GSPO" ,
"GPOS" ,
"GOSP" ,
"POSG" ,
"OSPG" ,
"SPOG"
] ,
"tdb.prefix_index_primary" : "GPU" ,
"tdb.prefix_indexes" : [ "GPU" ] ,
"tdb.file_prefix_index" : "prefixIdx" ,
"tdb.file_prefix_nodeid" : "prefix2id" ,
"tdb.file_prefix_id2node" : "prefixes"
}
1
\ No newline at end of file
@prefix : <http://base/#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
:service_tdb_all a fuseki:Service ;
rdfs:label "TDB default" ;
fuseki:dataset :tdb_dataset_readwrite ;
fuseki:name "default" ;
fuseki:serviceQuery "query" , "sparql" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:serviceReadWriteGraphStore
"data" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" .
:tdb_dataset_readwrite
a tdb:DatasetTDB ;
tdb:location "/fuseki/databases/default" .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable in-memory dataset.
<#service1> rdf:type fuseki:Service ;
# URI of the dataset -- http://host:port/{NAME}
fuseki:name "{NAME}" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceQuery "query" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#dataset> ;
.
# Transactional, in-memory dataset. Initially empty.
<#dataset> rdf:type ja:DatasetTxnMem .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Read-only in-memory dataset - used as a default, dummy dataset
<#service1> rdf:type fuseki:Service ;
fuseki:name "" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceQuery "query" ;
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#dataset> ;
.
## In-memory, empty.
<#dataset> rdf:type ja:RDFDataset .
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "{FUSEKI_BASE}/databases/{NAME}" ;
##ja:context [ ja:cxtName "arq:queryTimeout" ; ja:cxtValue "3000" ] ;
##tdb:unionDefaultGraph true ;
.
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "{DIR}" ;
##tdb:unionDefaultGraph true ;
.
# Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
@prefix : <#> .
@prefix fuseki: <http://jena.apache.org/fuseki#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> .
@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> .
## ---------------------------------------------------------------
## Updatable in-memory TDB dataset with all services enabled.
<#service_tdb_all> rdf:type fuseki:Service ;
rdfs:label "TDB {NAME}" ;
fuseki:name "{NAME}" ;
fuseki:serviceQuery "query" ;
fuseki:serviceQuery "sparql" ;
fuseki:serviceUpdate "update" ;
fuseki:serviceUpload "upload" ;
fuseki:serviceReadWriteGraphStore "data" ;
# A separate read-only graph store endpoint:
fuseki:serviceReadGraphStore "get" ;
fuseki:dataset <#tdb_dataset_readwrite> ;
.
<#tdb_dataset_readwrite> rdf:type tdb:DatasetTDB ;
tdb:location "--mem--" ;
## tdb:unionDefaultGraph true ;
.
orchestrator @ bff06cc1
Subproject commit 51cbed2230fb278e54d646264578ca2874597f74 Subproject commit bff06cc179745c434e63d06f7a7650a2aa4b6fa1
...@@ -6,6 +6,13 @@ ...@@ -6,6 +6,13 @@
Welcome to Soneti's documentation! Welcome to Soneti's documentation!
================================== ==================================
**Soneti** is a toolkit for **analyzing social media**, such as social networks (e.g. Twitter, Facebook, ...), blogs, YouTube, Newspapers, AppStores, etc.
It obtains data from different sources and, in addition, enriches the obtained data by performing different types of automatic analysis. Finally, it allows us to visualize the data obtained in interactive dashboards.
.. figure:: figures/soneti.png
:alt: Soneti overview
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
:caption: Contents: :caption: Contents:
......
============ ===========
Installation Quick start
============ ===========
Soneti's installation is quite easy and in a few steps you can have the toolkit installed and ready to use. Soneti's services installation is quite easy and in a few steps you can have the demo services working.
First of all it is necessary to clone the GitLab repository: First of all it is necessary to clone the GitLab repository:
.. sourcecode:: bash .. sourcecode:: bash
...@@ -17,11 +17,12 @@ Now images are ready to run: ...@@ -17,11 +17,12 @@ Now images are ready to run:
$ docker-compose up --build $ docker-compose up --build
Check Senpy service is working on http://localhost:8000, GSICrawler service is working on http://localhost:5000 and Sefarad is working on http://localhost:8080 This installation offers a basic version of each service:
Also is necessary to initialize Fuseki dataset in order to be able to store some data. Browse to http://localhost:3030/manage.html?tab=new-dataset. User and password required are admin and fusekisoneti respectively. * **GSICrawler:** This ingestion service demo has CNN, New York Times, ElMundo, Facebook and Twitter as possible sources. This service is available on http://localhost:5000
Name the dataset default and select Persistent option. Click on create dataset button. * **Senpy**: This analysis service demo has sentiment140 as sentiment analysis plugin and EmoRand as emotion analysis plugin. This service is available on http://localhost:8000/
Now your Soneti toolkit is ready to work. * **Sefarad**: This visualization demo environment provides a dashboard for Somedi project and is available on http://localhost:8080.
* **Orchestrator**: Luigi provides a web interface to check your workflows status on http://localhost:8082
...@@ -2,22 +2,21 @@ ...@@ -2,22 +2,21 @@
Use cases Use cases
========= =========
In this documentation we are going to show some uses of Soneti toolkit using the orchestrator. In this documentation we are going to show some uses of Soneti toolkit.
First of all, you need to create a python script named `orchestrator.py` on root folder and add our dependencies: SOMEDI: Social Media and Digital Interaction Intelligence
---------------------------------------------------------
.. sourcecode:: python This use case is part of the SOMEDI project. In this use case we are going to track the Restaurantes Lateral brand on social media.
import luigi We are going to describe this use case in different incremental phases.
from luigi.contrib.esindex import CopyToIndex
from orchestrator.SenpyAnalysis import SenpyAnalysis
from orchestrator.GSICrawlerScraper import GSICrawlerScraper
from orchestrator.CopyToFuseki import CopyToFuseki
Use case 1: Use GSICrawler to get some news I. Use GSICrawler to get tweets and Facebook posts from official accounts
------------------------------------------- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This use case is going to retrieve the latest news on CNN newspaper about Trump, for doing so is necessary to add some lines to our python script created before: This phase gets tweets and Facebook posts from official accounts and prints the results.
Below is the detailed part of the task located on `somedi-usecase/workflow.py`.
.. sourcecode:: python .. sourcecode:: python
...@@ -31,19 +30,27 @@ This use case is going to retrieve the latest news on CNN newspaper about Trump, ...@@ -31,19 +30,27 @@ This use case is going to retrieve the latest news on CNN newspaper about Trump,
def output(self): def output(self):
return luigi.LocalTarget(path='/tmp/_scrapy-%s.json' % self.id) return luigi.LocalTarget(path='/tmp/_scrapy-%s.json' % self.id)
As shown in the code we select as endpoint our GSICrawler service and other parameters are going to be given by command line. As shown in the code we select as endpoint our GSICrawler demo service and other parameters are going to be given by command line.
Run the orchestrator: Run the orchestrator's workflow to retrieve the 10 latest tweets:
.. sourcecode:: bash .. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module orchestrator ScrapyTask --query Trump --number 10 --source cnn --id 1 $ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow ScrapyTask --query rest_lateral --number 10 --source twitter --id 1
Now run the orchestrator's workflow to retrieve the 10 latest Facebook posts; the query must be the official account name on Facebook without the @:
.. sourcecode:: bash
$ docker-compose exec orchestrator python -m luigi --module somedi-usecase.workflow ScrapyTask --query restauranteslateral --number 10 --source facebook --id 2
Use case 2: Use GSICrawler to get some tweets, and analyse sentiments with Senpy
--------------------------------------------------------------------------------
This use case improve the use case 1 adding analysis with Senpy. In addition, we change the data source from CNN newspaper to Twitter. Modify the python script adding this lines: II. Analyse collected tweets and Facebook posts with Senpy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This phase improves the previous one by adding analysis with Senpy.
Below is the detailed part of the task located on `somedi-usecase/workflow.py`.
.. sourcecode:: python .. sourcecode:: python
...@@ -53,12 +60,9 @@ This use case improve the use case 1 adding analysis with Senpy. In addition, we ...@@ -53,12 +60,9 @@ This use case improve the use case 1 adding analysis with Senpy. In addition, we
id = luigi.Parameter() id = luigi.Parameter()
number =luigi.Parameter() number =luigi.Parameter()
source = luigi.Parameter() source = luigi.Parameter()
host = 'http://senpy:5000/api/' host = 'http://senpy:5000/api/'