Giter Club home page Giter Club logo

sketchy's People

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar  avatar

sketchy's Issues

sketch and LSH

Hi, all
I use the random projection to generate sketch for vector, it seem the sketch algorithm is work fine. But I want to find k nearest vector, I write a simple LSH index based on sketch, the code is here:

'''
This module implements the Locality Sensitive Hashing index
with MinHash (MinHash LSH) and Weighted MinHash (Weighted MinHash LSH)
Both indexes supports query with Jaccard similarity threshold.

Reference: Chapter 3, Mining of Massive Datasets
(http://www.mmds.org/)
'''
from collections import defaultdict
import math


_integration_precision = 0.001
def _integration(f, a, b):
    p = _integration_precision
    area = 0.0
    x = a
    while x < b:
        area += f(x+0.5*p)*p
        x += p
    return area, None

try:
    from scipy.integrate import quad as integrate
except ImportError:
    # For when no scipy installed
    integrate = _integration


def _false_positive_probability(threshold, b, r):
    _probability = lambda s : 1 - (1 - s**float(r))**float(b)
    a, err = integrate(_probability, 0.0, threshold)
    return a


def _false_negative_probability(threshold, b, r):
    _probability = lambda s : 1 - (1 - (1 - s**float(r))**float(b))
    a, err = integrate(_probability, threshold, 1.0)
    return a


def _optimal_param(threshold, num_perm, false_positive_weight,
        false_negative_weight):
    '''
    Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
    of probabilities of false positive and false negative.
    '''
    min_error = float("inf")
    opt = (0, 0)
    for b in range(1, num_perm+1):
        max_r = int(num_perm / b)
        for r in range(1, max_r+1):
            fp = _false_positive_probability(threshold, b, r)
            fn = _false_negative_probability(threshold, b, r)
            error = fp*false_positive_weight + fn*false_negative_weight
            if error < min_error:
                min_error = error
                opt = (b, r)
    return opt


class SketchLSH(object):
    '''
    The sketch LSH
    '''

    def __init__(self, threshold=0.9, num_perm=128, weights=(0.5,0.5)):
        if threshold > 1.0 or threshold < -1.0:
            raise ValueError("threshold must be in [-1.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")

        # sketch lsh(d1, d2, p1, p2)
        # p1 = (180-d1)/180 where d1 and d2 is in degree
        threshold_degree = math.acos(threshold)/math.pi*180
        prob_threshold = (180-threshold_degree)/180
        threshold = prob_threshold

        self.threshold = threshold
        self.h = num_perm
        false_positive_weight, false_negative_weight = weights
        self.b, self.r = _optimal_param(threshold, num_perm,
                false_positive_weight, false_negative_weight)
        self.hashtables = [defaultdict(list) for _ in range(self.b)]
        self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
        self.keys = dict()

    def split(self, sketch):
        if len(sketch) != self.h:
            raise ValueError("Expecting minhash with length %d, got %d"
                    % (self.h, len(minhash)))
        band_values = [sketch[start:end] for start, end in self.hashranges]
        return band_values

I use sparse_random_projection to generate sketch for vector, then use LSH to index it.

from sketchy import sparse_random_projection
import sys

def bitfield(n, size):
    b = [str(n >> i & 1) for i in range(size-1, -1, -1)]
    return ''.join(b)

def test_vector():
    hash_size = 500
    sketch_lsh = SketchLSH(threshold=0.7, num_perm=hash_size)

    for line in sys.stdin:
        fields = line.split()
        uid = fields[0]
        vec = map(float, fields[1:])
        hash_val = sparse_random_projection(vec, hash_size, 100, 1013)
        bit_val = bitfield(hash_val, hash_size)
        band_val = sketch_lsh.split(bit_val)
        for index, value in enumerate(band_val):
            print '%s:%s\t%s' % (index, value, uid)

here is my vector file test_vec.txt

these two user 9884538920 5228117548, the similarity between them is 0.223718682305,
but my LSH index make them as candidate.

cat test_vec.txt | grep -P  '(9884538920|5228117548)' | python hash_vector.py | awk '{print $1}' | sort -k1 | uniq -c | grep ' *2 '

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    ๐Ÿ–– Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. ๐Ÿ“Š๐Ÿ“ˆ๐ŸŽ‰

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.