Faqts : Business : Programming : Shopping For You : Python : Snippets : Strings

+ Search
Add Entry Alert | Manage Folder | Edit Entry | Add page to http://del.icio.us/
Did You Find This Entry Useful?

41 of 60 people (68%) answered Yes
Recently 6 of 10 people (60%) answered Yes

Entry

Calculate the Index of Coincidence for a string.

Dec 18th, 2009 15:26
new acct, John Lehmann,


# Reference index-of-coincidence value for each language name; findLanguage()
# returns the key whose value is closest to the IC measured for a text.
# NOTE(review): "Norweigian" is misspelled but is a key that findLanguage()
# returns to callers, so it is left as-is here.
LANGUAGES = {
	"Arabic": 0.075889,
	"Danish": 0.070731,
	"Dutch": 0.079805,
	"English": 0.066895,
	"Finnish": 0.073796,
	"French": 0.074604,
	"German": 0.076667,
	"Greek": 0.069165,
	"Hebrew": 0.076844,
	"Italian": 0.073294,
	"Japanese": 0.077236,
	"Malay": 0.085286,
	"Norweigian": 0.069428,
	"Portuguese": 0.074528,
	"Russian": 0.056074,
	"Serbo Croatian": 0.064363,
	"Spanish": 0.076613,
	"Swedish": 0.064489,
	# Approximately 1/26 -- presumably uniformly random 26-letter text.
	"Random": 0.038461,
}

def calculateIC(s):
	"""
	Calculate the index of coincidence of the letters in s.

	                 F ( F - 1)
	   IC = (sum of) ----------
	                 N ( N - 1)

	F is the number of occurrences of each distinct letter and N is the
	total number of letters.  Every character is lower-cased first, and
	only characters that are alphabetic after lower-casing are counted.

	Returns a float.  When s holds fewer than two letters, N ( N - 1) is
	zero and no pair of letters exists, so 0.0 is returned rather than
	raising ZeroDivisionError.
	"""
	from collections import Counter

	# Lower-case each character *before* the isalpha() test so that the
	# filtering is identical to counting on the lowered form.
	counts = Counter(ch for ch in (c.lower() for c in s) if ch.isalpha())
	n = sum(counts.values())
	if n < 2:
		return 0.0
	coincidences = sum(f * (f - 1) for f in counts.values())
	# float() keeps true division on interpreters where int / int truncates.
	return float(coincidences) / (n * (n - 1))

def findLanguage(s):
	"""
	Guess the language of s from its index of coincidence.

	Computes calculateIC(s) and returns the key of LANGUAGES whose
	reference value has the smallest absolute difference from it.
	If several entries are equally close, the one that comes first when
	iterating LANGUAGES is returned.
	"""
	ic = calculateIC(s)
	# min() over the dict iterates its keys and keeps the first minimum, so
	# no intermediate distance table or full sort is needed.
	return min(LANGUAGES, key=lambda name: abs(ic - LANGUAGES[name]))