Commit f8ca595b authored by militarpancho's avatar militarpancho
Browse files

Added chunker plugin to tokenize texts

parent 312e7f7f
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk
class ChunkerPlugin(AnalysisPlugin):
    """Analysis plugin that splits an entry's text into smaller entries.

    The ``type`` parameter selects the granularity: ``"sentence"`` uses
    NLTK's ``PunktSentenceTokenizer`` and ``"paragraph"`` uses NLTK's
    ``LineTokenizer``.
    """

    def activate(self):
        """Download the NLTK ``punkt`` resource."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Yield one new ``Entry`` per chunk found in ``entry``'s text.

        Each yielded entry carries the chunk as ``text`` and an ``id`` made of
        the original entry id followed by ``#char=<start>,<end>``, where
        ``start`` and ``end`` are the character offsets reported by the
        tokenizer's ``span_tokenize``.

        Args:
            entry: The entry to split; its ``id`` and ``"text"`` are read.
            params: Request parameters; ``params["type"]`` selects the
                chunker (``"sentence"`` by default, or ``"paragraph"``).

        Yields:
            Entry: One entry per chunk, in document order. Nothing is yielded
            for an unrecognised ``type`` or when the entry has no text.
        """
        chunker_type = params.get("type", "sentence")
        original_id = entry.id
        original_text = entry.get("text", None)

        if chunker_type == "sentence":
            tokenizer = PunktSentenceTokenizer()
        elif chunker_type == "paragraph":
            tokenizer = LineTokenizer()
        else:
            # Unrecognised chunker types silently produce no output.
            return

        # Missing or empty text has nothing to chunk; bail out instead of
        # handing None to the tokenizer.
        if not original_text:
            return

        # span_tokenize may return a lazy iterator rather than a list, so the
        # spans are paired with the chunks via zip instead of being indexed.
        chunks = tokenizer.tokenize(original_text)
        spans = tokenizer.span_tokenize(original_text)
        for chunk, (start, end) in zip(chunks, spans):
            e = Entry()
            e.text = chunk
            e.id = original_id + "#char={},{}".format(start, end)
            yield e
---
# Plugin descriptor for the "chunker" plugin (module: chunker).
# NOTE(review): the nesting of the keys below appears to have been flattened
# in this copy; `type` and its sub-keys presumably belong under
# `extra_params` — confirm against the original file.
name: chunker
module: chunker
description: A sample plugin that chunks input text
author: "@militarpancho"
version: '0.1'
url: "https://github.com/gsi-upm/senpy"
# NOTE(review): `{nltk}` is a YAML flow mapping (key `nltk`, null value), not
# a list — confirm this is the shape the plugin loader expects.
requirements: {nltk}
extra_params:
# `type` selects the chunking granularity; it is optional and defaults to
# `sentence`.
type:
aliases:
- type
- t
required: false
default: sentence
# Accepted values for `type`.
options:
- sentence
- paragraph
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment