# Source code for alis.feature_extraction.shingles

"""
Functions for extracting different types of
shingles given a text.
"""

import hashlib

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def hash_function(text, n):
    """Map a text to an integer bucket using its SHA-1 digest

    The text is encoded as UTF-8, hashed with SHA-1, and the digest
    (read as one big-endian unsigned integer) is reduced modulo
    ``2**n - 1``.

    Parameters
    ----------
    text : str
        String of text whose hash representation is to be computed
    n : int
        The number defining the number of buckets: 2**n - 1. With
        ``n = 0`` the modulus is zero and a ZeroDivisionError is raised.

    Returns
    -------
    hashed_int : int
        Hashed representation of the given text, between 0 and
        2**n - 2 inclusive
    """
    digest = hashlib.sha1(text.encode('utf-8')).digest()
    bucket_count = 2**n - 1
    # Big-endian interpretation of the raw digest equals parsing the
    # hexadecimal digest as a base-16 integer.
    return int.from_bytes(digest, 'big') % bucket_count


def k_shingles(text, k):
    """Return all the character k-shingles of the text

    A k-shingle is a run of `k` consecutive characters of `text`; one
    is taken at every starting position that leaves room for `k`
    characters.

    Parameters
    ----------
    text : str
        String of text in which shingles are to be extracted
    k : int
        Shingle size

    Returns
    -------
    shingles : set of str
        The distinct k-shingles in the input text. Empty when `text`
        has fewer than `k` characters.
    """
    return {text[i:i + k] for i in range(len(text) - k + 1)}


def hashed_shingles(text, k, n):
    """Return all the `k`-shingles in the given `text` hashed into
    bucket numbers

    Each distinct character `k`-shingle is passed through
    `hash_function`, which reduces a SHA-1 digest modulo ``2**n - 1``.

    Parameters
    ----------
    text : str
        String of text in which shingles are to be extracted
    k : int
        Shingle size
    n : int
        The number defining the number of buckets: 2**n - 1

    Returns
    -------
    shingles : set of int
        The distinct bucket numbers of the k-shingles in the input
        text, each between 0 and 2**n - 2 inclusive
    """
    # Deduplicate the shingles first so that a repeated shingle is only
    # hashed once; the resulting set of buckets is the same.
    return {hash_function(shingle, n) for shingle in k_shingles(text, k)}


def word_shingles(text, k, stop_words=None):
    """Return the word `k`-shingles of the given text that start at a
    stop word

    We define a shingle to be a stop word followed by the next `k-1`
    words regardless of whether the next words were stop words or not.
    Words are obtained by splitting `text` on whitespace and are matched
    against the stop words exactly (case-sensitive). Shingles are the
    words joined back with single spaces.

    Parameters
    ----------
    text : str
        String of text whose word shingles are to be extracted
    k : int
        Shingle size
    stop_words : iterable of str, default=None
        Stop words to be used. By default, uses the English stopwords
        defined by sklearn

    Returns
    -------
    shingles : set of str
        The distinct word shingles extracted from the text. Empty when
        the text has fewer than `k` words.
    """
    # Define stop words if none is given
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    elif not isinstance(stop_words, (set, frozenset)):
        # Materialise once: a one-shot iterable (e.g. a generator) would
        # be exhausted by the first membership test, and a list would
        # cost a linear scan per word.
        stop_words = frozenset(stop_words)

    words = text.split()
    return {
        ' '.join(words[i:i + k])
        for i in range(len(words) - k + 1)
        if words[i] in stop_words
    }


def hashed_word_shingles(text, k, n, stop_words=None):
    """Return the word `k`-shingles of the given text that start at a
    stop word, hashed into bucket numbers

    We define a shingle to be a stop word followed by the next `k-1`
    words regardless of whether the next words were stop words or not.
    The shingles are extracted by `word_shingles` and each one is then
    passed through `hash_function`, which reduces a SHA-1 digest modulo
    ``2**n - 1``.

    Parameters
    ----------
    text : str
        String of text whose word shingles are to be extracted
    k : int
        Shingle size
    n : int
        The number defining the number of buckets: 2**n - 1
    stop_words : iterable of str, default=None
        Stop words to be used. By default, uses the English stopwords
        defined by sklearn

    Returns
    -------
    shingles : set of int
        The distinct bucket numbers of the extracted word shingles,
        each between 0 and 2**n - 2 inclusive
    """
    # Reuse the plain extractor so both functions always agree on what a
    # word shingle is; every distinct shingle is hashed exactly once.
    return {
        hash_function(shingle, n)
        for shingle in word_shingles(text, k, stop_words)
    }