split.py 1.82 KB
Newer Older
J. Fernando Sánchez's avatar
J. Fernando Sánchez committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from senpy.plugins import AnalysisPlugin
from senpy.models import Entry
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.simple import LineTokenizer
import nltk


# Analysis plugin that splits the text of an entry into smaller entries, one
# per sentence or one per paragraph (line), selected by the "delimiter"
# parameter.
# NOTE(review): documented with comments rather than a class docstring so the
# set of class attributes seen by the plugin framework stays unchanged.
class SplitPlugin(AnalysisPlugin):

    def activate(self):
        """Download the NLTK 'punkt' resource when the plugin is activated."""
        nltk.download('punkt')

    def analyse_entry(self, entry, params):
        """Split the text of *entry* and yield one new ``Entry`` per chunk.

        ``params["delimiter"]`` selects the splitting strategy: ``"sentence"``
        (the default) uses ``PunktSentenceTokenizer``; ``"paragraph"`` uses
        ``LineTokenizer``.

        Each yielded entry carries the chunk text in ``nif:isString``. When
        the source entry has an id, the chunk id is that id followed by
        ``#char=<start>,<end>``, the character offsets of the chunk inside
        the original text.

        Raises:
            ValueError: if the delimiter is not one of the supported values.
        """
        chunker_type = params.get("delimiter", "sentence")
        original_text = entry['nif:isString']

        tokenizer_classes = {
            "sentence": PunktSentenceTokenizer,
            "paragraph": LineTokenizer,
        }
        tokenizer_class = tokenizer_classes.get(chunker_type)
        if tokenizer_class is None:
            # Fail with a clear message instead of an unbound local variable.
            raise ValueError(
                "Unknown delimiter {!r}; expected one of: {}".format(
                    chunker_type, ", ".join(sorted(tokenizer_classes))))
        tokenizer = tokenizer_class()

        # Offsets are derived from the chunks themselves rather than from a
        # separate span_tokenize() pass paired by index: the two passes are
        # not guaranteed to return the same number of items in the same
        # order, which would attach wrong offsets to a chunk.
        # Chunks come back in document order, so searching forward from the
        # end of the previous chunk locates each one in the original text.
        cursor = 0
        for chunk in tokenizer.tokenize(original_text):
            start = original_text.index(chunk, cursor)
            end = start + len(chunk)
            cursor = end

            e = Entry()
            e['nif:isString'] = chunk
            if entry.id:
                e.id = entry.id + "#char={},{}".format(start, end)
            yield e

    # Sample inputs with the entries the plugin is expected to produce.
    test_cases = [
        {
            'entry': {
                'nif:isString': 'Hello. World.'
            },
            'params': {
                'delimiter': 'sentence',
            },
            'expected': [
                {
                    'nif:isString': 'Hello.'
                },
                {
                    'nif:isString': 'World.'
                }
            ]
        },
        {
            'entry': {
                "id": ":test",
                'nif:isString': 'Hello\nWorld'
            },
            'params': {
                'delimiter': 'paragraph',
            },
            'expected': [
                {
                    "@id": ":test#char=0,5",
                    'nif:isString': 'Hello'
                },
                {
                    "@id": ":test#char=6,11",
                    'nif:isString': 'World'
                }
            ]
        }
    ]