Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
social
SoMeDi Use Case - Lateral
Commits
464fcdb1
Commit
464fcdb1
authored
Nov 06, 2017
by
Alberto Pascual
Browse files
facebook scraper fixes
parent
fa4ad0a3
Pipeline
#431
passed with stages
in 32 seconds
Changes
8
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
crontasks.py
View file @
464fcdb1
...
...
@@ -15,7 +15,7 @@ def main(args):
for
argument
in
args
[
1
:]:
identifier
=
time
.
time
()
command
=
'python -m luigi --module analysistask PipelineTask --index somedi --doc-type news --url {url} --id "{id}" --analysisType "sentiments,emotions" --num {num}'
.
format
(
url
=
str
(
argument
),
id
=
identifier
,
num
=
int
(
args
[
0
]))
subprocess
.
Popen
(
command
.
split
(),
shell
=
False
)
subprocess
.
call
(
command
.
split
(),
shell
=
False
)
def
cron
(
arg
):
...
...
demo/elements/google-chart-elasticsearch/google-chart.html
View file @
464fcdb1
...
...
@@ -574,7 +574,7 @@ Data can be provided in one of three ways:
return
[
key
,
pp
,
psoe
,
podemos
,
ciudadanos
];
});
console
.
log
(
data
)
}
else
{
...
...
k8s/somedi-deployment.yaml
View file @
464fcdb1
...
...
@@ -2,7 +2,7 @@
apiVersion
:
v1
kind
:
ConfigMap
metadata
:
name
:
somedi-config
name
:
${NAME}-crawler
data
:
ES_ENDPOINT
:
"
$ES_ENDPOINT"
ES_PORT
:
"
$ES_PORT"
...
...
@@ -17,14 +17,14 @@ data:
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
${NAME}
name
:
${NAME}
-crawler
spec
:
replicas
:
1
template
:
metadata
:
labels
:
role
:
somedi
-luigi
app
:
${NAME}
role
:
${NAME}
-luigi
app
:
${NAME}
-luigi
spec
:
imagePullSecrets
:
-
name
:
registry.cluster.gsi.dit.upm.es
...
...
@@ -41,7 +41,7 @@ spec:
containerPort
:
8082
envFrom
:
-
configMapRef
:
name
:
somedi-config
name
:
${NAME}-crawler
---
apiVersion
:
v1
kind
:
ConfigMap
...
...
@@ -54,14 +54,14 @@ data:
apiVersion
:
extensions/v1beta1
kind
:
Deployment
metadata
:
name
:
${NAME}-
deploy
name
:
${NAME}-
web
spec
:
replicas
:
1
template
:
metadata
:
labels
:
role
:
${NAME}-web
app
:
${NAME}
app
:
${NAME}
-web
spec
:
imagePullSecrets
:
-
name
:
registry.cluster.gsi.dit.upm.es
...
...
@@ -78,5 +78,5 @@ spec:
containerPort
:
8090
envFrom
:
-
configMapRef
:
name
:
${NAME}-
config
name
:
${NAME}-
web
k8s/somedi-ingress.yaml
View file @
464fcdb1
...
...
@@ -2,7 +2,7 @@
apiVersion
:
extensions/v1beta1
kind
:
Ingress
metadata
:
name
:
${NAME}
name
:
${NAME}
-crawler
annotations
:
ingress.kubernetes.io/rewrite-target
:
/
spec
:
...
...
@@ -12,13 +12,13 @@ spec:
paths
:
-
path
:
/
backend
:
serviceName
:
${NAME}
serviceName
:
${NAME}
-crawler
servicePort
:
8082
---
apiVersion
:
extensions/v1beta1
kind
:
Ingress
metadata
:
name
:
${NAME}
name
:
${NAME}
-web
annotations
:
ingress.kubernetes.io/rewrite-target
:
/
spec
:
...
...
@@ -28,6 +28,6 @@ spec:
paths
:
-
path
:
/
backend
:
serviceName
:
${NAME}
servicePort
:
80
8
0
serviceName
:
${NAME}
-web
servicePort
:
80
9
0
k8s/somedi-svc.yaml
View file @
464fcdb1
...
...
@@ -2,7 +2,7 @@
apiVersion
:
v1
kind
:
Service
metadata
:
name
:
${NAME}
name
:
${NAME}
-crawler
spec
:
type
:
ClusterIP
ports
:
...
...
@@ -10,16 +10,16 @@ spec:
port
:
8082
protocol
:
TCP
selector
:
role
:
somedi-luigi
role
:
${NAME}-crawler
---
apiVersion
:
v1
kind
:
Service
metadata
:
name
:
$NAME
name
:
$
{
NAME
}-web
spec
:
type
:
ClusterIP
ports
:
-
port
:
80
8
0
-
port
:
80
9
0
protocol
:
TCP
selector
:
role
:
${NAME}-web
...
...
luigi-task-hist.db
View file @
464fcdb1
No preview for this file type
scrapers/__pycache__/facebookScrapper.cpython-36.pyc
View file @
464fcdb1
No preview for this file type
scrapers/facebookScrapper.py
View file @
464fcdb1
...
...
@@ -46,6 +46,8 @@ def request_until_succeed(url):
# - shares&limit= : extrae el numero de veces que se ha compartido la noticia
def
getFBPageFeedData
(
page_id
,
num_status
,
filepath
):
page_idbak
=
page_id
if
page_id
==
"podemos"
:
page_id
=
"ahorapodemos"
if
page_id
==
"ciudadanos"
:
page_id
=
"Cs.Ciudadanos"
...
...
@@ -71,8 +73,8 @@ def getFBPageFeedData (page_id,num_status,filepath):
aux
[
"schema:datePublished"
]
=
post
[
"created_time"
]
aux
[
"schema:articleBody"
]
=
post
[
"message"
]
aux
[
"schema:author"
]
=
'facebook'
aux
[
"schema:creator"
]
=
page_id
aux
[
"schema:search"
]
=
page_id
aux
[
"schema:creator"
]
=
page_id
bak
aux
[
"schema:search"
]
=
page_id
bak
aux
[
'comments'
]
=
post
[
'comments'
]
json
.
dump
(
aux
,
outfile
)
outfile
.
write
(
'
\n
'
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment