giulio98 committed on
Commit
25b3536
1 Parent(s): e0dc47b

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +106 -0
utils.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Natural Language Toolkit: Utility functions
2
+ #
3
+ # Copyright (C) 2001-2020 NLTK Project
4
+ # Author: Steven Bird <stevenbird1@gmail.com>
5
+ # URL: <http://nltk.org/>
6
+ # For license information, see LICENSE.TXT
7
+
8
+ from itertools import chain
9
+
10
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Wrap an iterable with padding symbols prior to ngram extraction.

    Each requested side receives ``n - 1`` copies of its pad symbol, so that
    every real item can appear in every position of an ngram of degree ``n``.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether padding is inserted before the data
    :type pad_left: bool
    :param pad_right: whether padding is appended after the data
    :type pad_right: bool
    :param left_pad_symbol: the symbol used for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol used for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: iter
    """
    stream = iter(sequence)

    # Nothing to add: hand back the plain iterator without touching ``n``.
    if not (pad_left or pad_right):
        return stream

    # A side that is not padded contributes an empty tuple, which ``chain``
    # simply skips over.
    prefix = (left_pad_symbol,) * (n - 1) if pad_left else ()
    suffix = (right_pad_symbol,) * (n - 1) if pad_right else ()
    return chain(prefix, stream, suffix)
46
+
47
+
48
+ # add a flag to pad the sequence so we get peripheral ngrams?
49
+
50
+
51
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily yield every ngram of degree ``n`` found in ``sequence``.

    Each ngram is a tuple of ``n`` consecutive items. The result is a
    generator; wrap it in ``list`` to materialise it:

        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Enable ``pad_left`` and/or ``pad_right`` to obtain the additional ngrams
    that overlap the edges of the data:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: a generator of tuples
    :rtype: iter
    """
    padded = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Unique marker so that exhaustion can be told apart from any real item
    # (including None, which is a legitimate pad symbol).
    exhausted = object()

    # Pre-load the first n - 1 items; the main loop then completes each
    # window by adding exactly one more item.
    window = []
    still_needed = n - 1
    while still_needed > 0:
        head = next(padded, exhausted)
        if head is exhausted:
            # Fewer than n items in total: there are no ngrams to produce.
            # Returning here (rather than letting StopIteration escape)
            # keeps the generator well-behaved under PEP 479.
            return
        window.append(head)
        still_needed -= 1

    for token in padded:
        window.append(token)
        yield tuple(window)
        # Slide the window forward by dropping its oldest item.
        window.pop(0)