|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from itertools import chain |
|
|
|
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Wrap an iterable with padding symbols ahead of ngram extraction.

    Each side that is enabled receives ``n - 1`` copies of its pad symbol.
    When neither side is padded, the plain iterator over ``sequence`` is
    returned unchanged.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2], 3, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', '<s>', 1, 2]

    :param sequence: the items to pad
    :type sequence: sequence or iter
    :param n: the ngram degree; determines the pad width (``n - 1``)
    :type n: int
    :param pad_left: if true, prepend left padding
    :type pad_left: bool
    :param pad_right: if true, append right padding
    :type pad_right: bool
    :param left_pad_symbol: the symbol repeated on the left (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol repeated on the right (default is None)
    :type right_pad_symbol: any
    :return: a lazy iterator over the (possibly padded) items
    :rtype: iter
    """
    items = iter(sequence)

    # Collect the pieces in output order; the pad tuples are only built for
    # the sides that were actually requested.
    segments = [items]
    if pad_left:
        segments.insert(0, (left_pad_symbol,) * (n - 1))
    if pad_right:
        segments.append((right_pad_symbol,) * (n - 1))

    # Without any padding, hand back the bare iterator rather than a chain.
    if len(segments) == 1:
        return items
    return chain(*segments)
|
|
|
|
|
|
|
|
|
|
|
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily yield every run of ``n`` consecutive items of a sequence as a tuple.

    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    The result is a generator; wrap it in ``list`` to materialise it. Enable
    pad_left and/or pad_right to also get the ngrams that overlap the
    sequence boundaries:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    If the (padded) sequence holds fewer than ``n`` items, nothing is yielded.

    :param sequence: the items to turn into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: if true, pad the start of the sequence
    :type pad_left: bool
    :param pad_right: if true, pad the end of the sequence
    :type pad_right: bool
    :param left_pad_symbol: the symbol used for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol used for right padding (default is None)
    :type right_pad_symbol: any
    :return: a generator of tuples
    :rtype: iter
    """
    padded = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Unique marker so that exhaustion is told apart from any real item,
    # including None pad symbols.
    exhausted = object()

    # Prime the window with the first n - 1 items. Running dry here means no
    # full ngram exists, so the generator ends without yielding (returning,
    # rather than letting StopIteration escape, keeps PEP 479 happy).
    prefix = ()
    still_needed = n - 1
    while still_needed > 0:
        token = next(padded, exhausted)
        if token is exhausted:
            return
        prefix += (token,)
        still_needed -= 1

    # Slide the window: every further item completes one ngram, after which
    # the oldest item is dropped to make room for the next.
    for token in padded:
        gram = prefix + (token,)
        yield gram
        prefix = gram[1:]