| |
| |
| |
| |
| |
| |
|
|
| from itertools import chain |
|
|
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Return an iterator over ``sequence`` with optional boundary padding.

    Each enabled side receives ``n - 1`` copies of its pad symbol, so that
    every original item can later appear in each position of an ngram of
    degree ``n``. When ``n - 1`` is not positive, no padding is added.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether padding is prepended to the data
    :type pad_left: bool
    :param pad_right: whether padding is appended to the data
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: the items of ``sequence``, surrounded by the requested padding
    :rtype: iterator
    """
    # Segments are collected in output order; the data iterator is created
    # first so a non-iterable ``sequence`` fails before any padding is built.
    segments = [iter(sequence)]
    if pad_left:
        segments.insert(0, (left_pad_symbol,) * (n - 1))
    if pad_right:
        segments.append((right_pad_symbol,) * (n - 1))

    # Without padding the bare iterator is handed back unwrapped.
    if len(segments) == 1:
        return segments[0]
    return chain(*segments)
|
|
|
|
| |
|
|
|
|
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily generate the ngrams of a sequence of items as tuples.

    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    The result is an iterator; wrap it with ``list`` for a list version.
    Set pad_left or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    If the (padded) data holds fewer than ``n`` items, nothing is yielded.
    Values of ``n`` below 1 are not rejected; they produce the same output
    as ``n == 1``.

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: the ngrams, one tuple per window position
    :rtype: iterator of tuples
    """
    stream = pad_sequence(
        sequence,
        n,
        pad_left=pad_left,
        pad_right=pad_right,
        left_pad_symbol=left_pad_symbol,
        right_pad_symbol=right_pad_symbol,
    )

    # Pre-load the first n - 1 items so that each further item completes a
    # full window. A private sentinel marks exhaustion, so items that are
    # None (for instance the default pad symbol) are never mistaken for it.
    exhausted = object()
    window = []
    slots_left = n
    while slots_left > 1:
        head = next(stream, exhausted)
        if head is exhausted:
            # Not enough data for even one ngram: end the generator quietly.
            return
        window.append(head)
        slots_left -= 1

    # Slide the window: push the new item, emit a snapshot, drop the oldest.
    for token in stream:
        window.append(token)
        yield tuple(window)
        window.pop(0)