# Implementation of Charikar simhashes in Python
# See: http://dsrg.mff.cuni.cz/~holub/sw/shash/#a1
if hashbits > 64: hashbits = 64
v = *hashbits
for t in :
bitmask = 0
for i in :
bitmask = 1 << i
if t & bitmask:
+= 1
else:
-= 1
fingerprint = 0
for i in :
if >= 0:
fingerprint += 1 << i
return fingerprint
# Use Hamming Distance to return % of similar bits
x = (a ^ b) & ((1 << hashbits) - 1)
tot = 0
while x:
tot += 1
x &= x-1
return /hashbits
test_set =
last = None
for t in test_set:
fingerprint =
print "%35s = %s" % (t, fingerprint)
print "\nHow similar are these?"
a = 'aaaaaaaaaaaaa 111111111111111'
b = 'aaaabaaaaaaaa 111111161111111'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
a = 'The Pursuit of HAPPINESS'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
b = 'the pursuit of happiness'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
b = 'HAPPINESS pursuit'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
b = 'happiness pursuit'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
by: andhy as, Jakarta, Indonesia, 08 2010
b = 'happiness pursuit of the WHATEVER'
print "'%s' and '%s'? %.2f%%" % ( a, b, *100 )
python
|
This entry was posted at 2:15 AM and is filed under python. You can follow any responses to this entry through the RSS 2.0 feed. You can leave a response, or trackback from your own site.
0 comments: