Commit 43b4cc0a authored by Óscar Araque

first commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
from icecream import ic
import pandas as pd
import pickle
import scattertext as st
import spacy
nlp = spacy.load("en_core_web_sm")
def main():
    print('Reading data')
    data = pd.read_pickle('data/conserva_wiki_processed_full.pck')
    simlengths = pd.read_csv('export/exact_simscores_lengths.tsv', sep='\t')

    # # filter considering the length ratio
    # conserva_keep_ids = set(simlengths[simlengths['Length_Ratio_processed'] <= 10]['Conservapedia_ID'].values)
    # wiki_keep_ids = set(simlengths[simlengths['Length_Ratio_processed'] <= 10]['Wikipedia_ID'].values)
    # conserva_texts = data[data['conserva_id'].isin(conserva_keep_ids)]['conserva_text']
    # wiki_texts = data[data['wiki_id'].isin(wiki_keep_ids)]['wiki_text']

    with open('conserva_wiki_selection_ids.pck', 'rb') as f:
        all_ids = pickle.load(f)

    conserva_texts = data[data['conserva_id'].isin(all_ids['conserva'])]['conserva_text_processed']
    wiki_texts = data[data['wiki_id'].isin(all_ids['wiki'])]['wiki_text_processed']

    def words_to_remove(w):
        # Note: despite the name, this returns True for words to KEEP.
        w = w.lower()
        stopwords = set(['num', 'nums'])
        if w in stopwords or len(w) < 4:
            return False
        else:
            return True

    conserva_texts = conserva_texts.str.split().apply(lambda ws: [w for w in ws if words_to_remove(w)]).apply(' '.join)
    wiki_texts = wiki_texts.str.split().apply(lambda ws: [w for w in ws if words_to_remove(w)]).apply(' '.join)
    ic(len(conserva_texts))
    ic(len(wiki_texts))

    data_cats = pd.DataFrame(data=pd.concat((conserva_texts, wiki_texts)),
                             columns=['text', ])
    ic(data_cats.shape)
    data_cats['category'] = ['conserva', ] * len(conserva_texts) + ['wiki'] * len(wiki_texts)
    print("Data shape:", data_cats.shape)

    print('Performing analysis')
    corpus = st.CorpusFromPandas(data_cats,
                                 category_col='category',
                                 text_col='text',
                                 nlp=st.whitespace_nlp_with_sentences).build()

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['Conservative Score'] = corpus.get_scaled_f_scores('conserva')
    conv_list = list(term_freq_df.sort_values(by='Conservative Score', ascending=False).index[:])
    pd.DataFrame(data=conv_list).head(100).to_csv('export/conv_list.csv', header=None, index=False)

    term_freq_df['Liberal Score'] = corpus.get_scaled_f_scores('wiki')
    libe_list = list(term_freq_df.sort_values(by='Liberal Score', ascending=False).index[:])
    pd.DataFrame(data=libe_list).head(100).to_csv('export/libe_list.csv', header=None, index=False)

    lists = pd.DataFrame(data=[conv_list, libe_list])

    html = st.produce_scattertext_explorer(corpus,
                                           category='conserva',
                                           category_name='Conservative',
                                           not_category_name='Liberal',
                                           width_in_pixels=1000, metadata=None,
                                           minimum_term_frequency=20, save_svg_button=True,
                                           )
    with open("export/conv_libe.html", 'wb') as f:
        f.write(html.encode('utf-8'))
    print('Done')


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding: utf-8
import re
import sys
import pickle
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")
nlp_reduced = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
# Read dataset
import os
from tqdm import tqdm
def read_file(path, ignore_title=True):
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()
    if ignore_title:
        # drop the title block (everything before the first blank line)
        content = content.split('\n\n', maxsplit=1)[-1]
    return content


def read_wiki_conserva(data_path, include_redirects=False):
    index_path = os.path.join(data_path, 'aligned/exact.tsv')
    index = pd.read_csv(index_path, sep='\t')
    dataset = []
    for i in tqdm(index.index):
        info = index.loc[i]
        new_id = info['New_ID']
        file_id_path = os.path.join(data_path, 'aligned/exact/{}'.format(new_id))
        conserva_content = read_file('{}.conserva'.format(file_id_path), ignore_title=True)
        wiki_content = read_file('{}.wiki'.format(file_id_path), ignore_title=True)
        dataset.append([info['Conservapedia_title'], info['Conservapedia_ID'], conserva_content,
                        info['Wikipedia_title'], info['Wikipedia_ID'], wiki_content])
    if include_redirects:
        # redirect handling is not implemented yet
        pass
    dataset = pd.DataFrame(data=dataset, columns=[
        'conserva_title', 'conserva_id', 'conserva_text', 'wiki_title', 'wiki_id', 'wiki_text',
    ])
    return dataset


def remove_references(text):
    # substitute wiki-like references such as "[1]"
    # (note: the pattern does not match reference numbers containing 0, e.g. "[10]")
    return re.sub(r"\[[1-9]+\]", "", text)
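

# Illustrative only: a quick check of remove_references() on a hypothetical sentence.
# >>> remove_references("The article was created in 2006.[1][2] It cites sources.[3]")
# 'The article was created in 2006. It cites sources.'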
def spacy_pprocess(docs):
    """Lowercase lemmas, drop stopwords and punctuation, strip unusual characters,
    and mask number-like tokens as NUM."""
    docs_ = list()
    for doc in docs:
        doc_ = list()
        for token in doc:
            if token.is_stop:
                continue
            if token.is_punct:
                continue
            token_ = token.lemma_.lower()
            if token.like_num:
                token_ = 'NUM'
            token_ = re.sub(r"[^A-Za-z0-9().,!?\'\`]", "", token_)
            token_ = re.sub(r"[0-9]+", "NUM", token_)
            doc_.append(token_)
        doc_ = ' '.join(doc_)
        docs_.append(doc_)
    return docs_
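

# Illustrative only: expected shape of the output for a hypothetical sentence,
# using the full `nlp` pipeline loaded above (exact lemmas depend on the spaCy
# model and version).
# >>> spacy_pprocess(nlp.pipe(["In 1995 the party won 42 seats."]))
# ['NUM party win NUM seat']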
def main():
    print('Loading dataset')
    dataset = read_wiki_conserva('data/wikipedia_convervapedia/')
    print('Done')

    print('Starting NLP pipe')
    docs = dict()
    docs['conserva'] = nlp_reduced.pipe(dataset['conserva_text'].apply(remove_references).values)
    docs['conserva'] = list(docs['conserva'])
    print('Done on conservative docs')
    print('Starting pipe on wiki docs')
    docs['wiki'] = nlp_reduced.pipe(dataset['wiki_text'].apply(remove_references).values)
    docs['wiki'] = list(docs['wiki'])
    print('Done on wiki docs')

    print('Starting post-processing')
    text = dict()
    text['conserva'] = spacy_pprocess(docs['conserva'])
    text['wiki'] = spacy_pprocess(docs['wiki'])
    print('Done')

    assert len(text['conserva']) == len(text['wiki'])

    # save full dataset
    dataset['conserva_text_processed'] = text['conserva']
    dataset['wiki_text_processed'] = text['wiki']
    dataset.to_pickle('data/conserva_wiki_processed_full.pck')

    data_cats = pd.DataFrame(data=(text['conserva'] + text['wiki']), columns=['text', ])
    data_cats['category'] = ['conserva', ] * dataset.shape[0] + ['wiki'] * dataset.shape[0]
    print("Data shape:", data_cats.shape)

    # save processed dataset
    with open('data/conserva_wiki_processed.pck', 'wb') as f:
        pickle.dump(data_cats, f)
    print('Done')


if __name__ == '__main__':
    main()
private
property
conserve
norm
tradition
nation
traditional
right
conventional
orthodox
preserve
national
army
family
bank
capital
republican
country
liberty
society
free
freedom
choice
equal
reformist
libertarian
rational
broad-minded
high-minded
indulgent
intelligent
reasonable
unbiased
unbigoted
unconventional
from icecream import ic
import os
import sys
import operator
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.topic_coherence import text_analysis
from gensim.models.doc2vec import Doc2Vec
def compute_dictionary(corpus):
    dictionary = Dictionary(corpus)
    dict_sorted = sorted(dictionary.token2id.items(), key=operator.itemgetter(1), reverse=True)
    dict_words = [e[0] for e in dict_sorted]
    dict_indexes = [e[1] for e in dict_sorted]
    return dictionary, dict_sorted, dict_words, dict_indexes


def cooccurence_matrix(corpus, context, window_size=10, processes=1):
    print('Creating dictionary')
    dictionary, dict_sorted, dict_words, dict_indexes = compute_dictionary(corpus)
    print('Done')
    aggregator = text_analysis.ParallelWordOccurrenceAccumulator(processes, dict_indexes, dictionary)
    aggregator.accumulate(corpus, window_size)
    M = []
    for w_i in tqdm(dict_words):
        m_tmp = []
        for w_j in context:
            try:
                cocc = aggregator.get_co_occurrences(w_i, w_j)
            except KeyError as err:
                print(err, file=sys.stderr)
                cocc = 0
            m_tmp.append(cocc)
        M.append(m_tmp)
    # The rows of M follow the order of dict_words, so return a word-to-row
    # mapping instead of dictionary.token2id (whose ids do not line up with
    # the rows built above).
    row2id = {w: i for i, w in enumerate(dict_words)}
    return np.array(M), row2id, dict_words
def pmi(counts, token2id, dict_words, context2id, positive_seeds, negative_seeds, smooth=0.1):
    """
    Learns valence scores using PMI.
    Adapted from Turney, P. and Littman, M. "Measuring Praise and Criticism: Inference of
    semantic orientation from association". ACM Trans. Inf. Syst., 2003, 21(4), 315-346.
    Adapted from https://github.com/williamleif/socialsent
    """
    polarities = {}
    for w in dict_words:
        if w not in positive_seeds and w not in negative_seeds:
            pol = sum(np.log(counts[token2id[w], context2id[seed]] + smooth)
                      - np.log(counts[token2id[seed], :].sum()) for seed in positive_seeds)
            pol -= sum(np.log(counts[token2id[w], context2id[seed]] + smooth)
                       - np.log(counts[token2id[seed], :].sum()) for seed in negative_seeds)
            polarities[w] = pol
    return polarities
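

# Illustrative only (hypothetical toy numbers, not from the corpus): with the
# word-to-row mapping returned by cooccurence_matrix(), pmi() scores a non-seed
# word by how much more strongly it co-occurs with the positive seeds than with
# the negative ones.
# >>> counts = np.array([[8.0, 1.0],    # row 0: 'liberty'
# ...                    [5.0, 0.0],    # row 1: 'free'  (used as a positive seed here)
# ...                    [0.0, 6.0]])   # row 2: 'norm'  (used as a negative seed here)
# >>> token2id = {'liberty': 0, 'free': 1, 'norm': 2}
# >>> context2id = {'free': 0, 'norm': 1}
# >>> pmi(counts, token2id, ['liberty', 'free', 'norm'], context2id, ['free'], ['norm'])
# {'liberty': 2.18}   # approximately; positive, since 'liberty' mostly co-occurs with 'free'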
def emb(corpus, model, positive_seeds, negative_seeds):
    #print('Creating dictionary')
    #dictionary, dict_sorted, dict_words, dict_indexes = compute_dictionary(corpus)
    #print('Done')
    # if seeds are not in the word embedding vocabulary, drop them
    # (index2word is the gensim < 4.0 attribute; it is index_to_key in gensim 4+)
    wv_vocab = set(model.index2word)
    positive_seeds = [s for s in positive_seeds if s in wv_vocab]
    negative_seeds = [s for s in negative_seeds if s in wv_vocab]
    #ic(positive_seeds)
    #ic(negative_seeds)
    #sys.exit(0)
    polarities = dict()
    #for w in tqdm(dict_words, file=sys.stdout):
    for w in tqdm(model.index2word, file=sys.stdout):
        if w not in positive_seeds and w not in negative_seeds:
            try:
                pol = sum(model.similarity(w, seed) for seed in positive_seeds)
                pol -= sum(model.similarity(w, seed) for seed in negative_seeds)
                polarities[w] = pol
            except KeyError as err:
                print(err, file=sys.stderr)
                continue
    return polarities


def load_embeddings(path):
    '''Return the KeyedVectors; we do not need the full Doc2Vec model.'''
    model = Doc2Vec.load(path)
    return model.wv


def read_seeds(path):
    positive_seeds = list(pd.read_csv(os.path.join(path, 'positive.txt'), header=None)[0])
    negative_seeds = list(pd.read_csv(os.path.join(path, 'negative.txt'), header=None)[0])
    return positive_seeds, negative_seeds


def save_valences(path, obj):
    vals = pd.DataFrame.from_dict(obj, orient='index')[0]
    vals.to_csv(path, sep='\t', header=False)
def main(args):
    # read dataset
    with open(args.data_path, 'rb') as f:
        data_cats = pickle.load(f)
    corpus = data_cats['text'].str.split().values

    # read seeds
    positive_seeds, negative_seeds = read_seeds(args.seeds_path)
    context = positive_seeds + negative_seeds
    print('Seed words:')
    print(' - '.join(context)); print()

    if args.method == 'pmi':
        print('Computing co-occurrence matrix')
        cocc, token2id, dict_words = cooccurence_matrix(corpus, context, processes=args.processes)
        print('Done')
        print('Computing PMI valences')
        context2id = {w: context.index(w) for w in context}
        valences = pmi(cocc, token2id, dict_words, context2id, positive_seeds, negative_seeds)
        print('Done')
    elif args.method == 'emb':
        print('Loading embedding model')
        emb_model = load_embeddings(args.emb_path)
        print('Done')
        print('Computing emb valences')
        valences = emb(corpus, emb_model, positive_seeds, negative_seeds)
        print('Done')

    print('Saving valences')
    save_valences(args.lex_path, valences)
    print('Done')


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Extract moral valences from seeds.')
    parser.add_argument('data_path', type=str, help='dataset path')
    parser.add_argument('seeds_path', type=str, help='seeds folder path')
    parser.add_argument('--method', type=str, help='method to use: pmi or emb', default='emb')
    parser.add_argument('--lex-path', type=str, help='path to save valences', default='valences.tsv')
    parser.add_argument('--processes', type=int, help='number of parallel processes', default=1)
    parser.add_argument('--emb-path', type=str, help='path to embedding model')
    args = parser.parse_args()
    main(args)
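

# Illustrative only: a possible invocation, assuming a hypothetical script name
# (valences.py) and hypothetical seed/model paths alongside the dataset produced
# by the preprocessing script above.
#   python valences.py data/conserva_wiki_processed.pck seeds/ --method pmi --processes 4
#   python valences.py data/conserva_wiki_processed.pck seeds/ --method emb --emb-path model.d2v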