Upload fuzz.py

cf8a101 over 4 years ago

9.59 kB

	#!/usr/bin/env python
	# encoding: utf-8
	from __future__ import unicode_literals
	import platform
	import warnings

	try:
	from .StringMatcher import StringMatcher as SequenceMatcher
	except ImportError:
	if platform.python_implementation() != "PyPy":
	warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
	from difflib import SequenceMatcher

	from . import utils


	###########################
	# Basic Scoring Functions #
	###########################

	@utils.check_for_none
	@utils.check_for_equivalence
	@utils.check_empty_string
	def ratio(s1, s2):
	s1, s2 = utils.make_type_consistent(s1, s2)

	m = SequenceMatcher(None, s1, s2)
	return utils.intr(100 * m.ratio())


	@utils.check_for_none
	@utils.check_for_equivalence
	@utils.check_empty_string
	def partial_ratio(s1, s2):
	""""Return the ratio of the most similar substring
	as a number between 0 and 100."""
	s1, s2 = utils.make_type_consistent(s1, s2)

	if len(s1) <= len(s2):
	shorter = s1
	longer = s2
	else:
	shorter = s2
	longer = s1

	m = SequenceMatcher(None, shorter, longer)
	blocks = m.get_matching_blocks()

	# each block represents a sequence of matching characters in a string
	# of the form (idx_1, idx_2, len)
	# the best partial match will block align with at least one of those blocks
	# e.g. shorter = "abcd", longer = XXXbcdeEEE
	# block = (1,3,3)
	# best score === ratio("abcd", "Xbcd")
	scores = []
	for block in blocks:
	long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
	long_end = long_start + len(shorter)
	long_substr = longer[long_start:long_end]

	m2 = SequenceMatcher(None, shorter, long_substr)
	r = m2.ratio()
	if r > .995:
	return 100
	else:
	scores.append(r)

	return utils.intr(100 * max(scores))


	##############################
	# Advanced Scoring Functions #
	##############################

	def _process_and_sort(s, force_ascii, full_process=True):
	"""Return a cleaned string with token sorted."""
	# pull tokens
	ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
	tokens = ts.split()

	# sort tokens and join
	sorted_string = u" ".join(sorted(tokens))
	return sorted_string.strip()


	# Sorted Token
	# find all alphanumeric tokens in the string
	# sort those tokens and take ratio of resulting joined strings
	# controls for unordered string elements
	@utils.check_for_none
	def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
	sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
	sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)

	if partial:
	return partial_ratio(sorted1, sorted2)
	else:
	return ratio(sorted1, sorted2)


	def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
	"""Return a measure of the sequences' similarity between 0 and 100
	but sorting the token before comparing.
	"""
	return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)


	def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
	"""Return the ratio of the most similar substring as a number between
	0 and 100 but sorting the token before comparing.
	"""
	return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


	@utils.check_for_none
	def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
	"""Find all alphanumeric tokens in each string...
	- treat them as a set
	- construct two strings of the form:
	<sorted_intersection><sorted_remainder>
	- take ratios of those two strings
	- controls for unordered partial matches"""

	if not full_process and s1 == s2:
	return 100

	p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
	p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

	if not utils.validate_string(p1):
	return 0
	if not utils.validate_string(p2):
	return 0

	# pull tokens
	tokens1 = set(p1.split())
	tokens2 = set(p2.split())

	intersection = tokens1.intersection(tokens2)
	diff1to2 = tokens1.difference(tokens2)
	diff2to1 = tokens2.difference(tokens1)

	sorted_sect = " ".join(sorted(intersection))
	sorted_1to2 = " ".join(sorted(diff1to2))
	sorted_2to1 = " ".join(sorted(diff2to1))

	combined_1to2 = sorted_sect + " " + sorted_1to2
	combined_2to1 = sorted_sect + " " + sorted_2to1

	# strip
	sorted_sect = sorted_sect.strip()
	combined_1to2 = combined_1to2.strip()
	combined_2to1 = combined_2to1.strip()

	if partial:
	ratio_func = partial_ratio
	else:
	ratio_func = ratio

	pairwise = [
	ratio_func(sorted_sect, combined_1to2),
	ratio_func(sorted_sect, combined_2to1),
	ratio_func(combined_1to2, combined_2to1)
	]
	return max(pairwise)


	def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
	return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)


	def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
	return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


	###################
	# Combination API #
	###################

	# q is for quick
	def QRatio(s1, s2, force_ascii=True, full_process=True):
	"""
	Quick ratio comparison between two strings.

	Runs full_process from utils on both strings
	Short circuits if either of the strings is empty after processing.

	:param s1:
	:param s2:
	:param force_ascii: Allow only ASCII characters (Default: True)
	:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
	:return: similarity ratio
	"""

	if full_process:
	p1 = utils.full_process(s1, force_ascii=force_ascii)
	p2 = utils.full_process(s2, force_ascii=force_ascii)
	else:
	p1 = s1
	p2 = s2

	if not utils.validate_string(p1):
	return 0
	if not utils.validate_string(p2):
	return 0

	return ratio(p1, p2)


	def UQRatio(s1, s2, full_process=True):
	"""
	Unicode quick ratio

	Calls QRatio with force_ascii set to False

	:param s1:
	:param s2:
	:return: similarity ratio
	"""
	return QRatio(s1, s2, force_ascii=False, full_process=full_process)


	# w is for weighted
	def WRatio(s1, s2, force_ascii=True, full_process=True):
	"""
	Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

	Steps in the order they occur

	#. Run full_process from utils on both strings
	#. Short circuit if this makes either string empty
	#. Take the ratio of the two processed strings (fuzz.ratio)
	#. Run checks to compare the length of the strings
	* If one of the strings is more than 1.5 times as long as the other
	use partial_ratio comparisons - scale partial results by 0.9
	(this makes sure only full results can return 100)
	* If one of the strings is over 8 times as long as the other
	instead scale by 0.6

	#. Run the other ratio functions
	* if using partial ratio functions call partial_ratio,
	partial_token_sort_ratio and partial_token_set_ratio
	scale all of these by the ratio based on length
	* otherwise call token_sort_ratio and token_set_ratio
	* all token based comparisons are scaled by 0.95
	(on top of any partial scalars)

	#. Take the highest value from these results
	round it and return it as an integer.

	:param s1:
	:param s2:
	:param force_ascii: Allow only ascii characters
	:type force_ascii: bool
	:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
	:return:
	"""

	if full_process:
	p1 = utils.full_process(s1, force_ascii=force_ascii)
	p2 = utils.full_process(s2, force_ascii=force_ascii)
	else:
	p1 = s1
	p2 = s2

	if not utils.validate_string(p1):
	return 0
	if not utils.validate_string(p2):
	return 0

	# should we look at partials?
	try_partial = True
	unbase_scale = .95
	partial_scale = .90

	base = ratio(p1, p2)
	len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

	# if strings are similar length, don't use partials
	if len_ratio < 1.5:
	try_partial = False

	# if one string is much much shorter than the other
	if len_ratio > 8:
	partial_scale = .6

	if try_partial:
	partial = partial_ratio(p1, p2) * partial_scale
	ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
	* unbase_scale * partial_scale
	ptser = partial_token_set_ratio(p1, p2, full_process=False) \
	* unbase_scale * partial_scale

	return utils.intr(max(base, partial, ptsor, ptser))
	else:
	tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
	tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale

	return utils.intr(max(base, tsor, tser))


	def UWRatio(s1, s2, full_process=True):
	"""Return a measure of the sequences' similarity between 0 and 100,
	using different algorithms. Same as WRatio but preserving unicode.
	"""
	return WRatio(s1, s2, force_ascii=False, full_process=full_process)