Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
robertopucp
GitHub Repository: robertopucp/1eco35_2022_2
Path: blob/main/Lab9/script_fuzzy_py.ipynb
2714 views
Kernel: Python 3 (ipykernel)

Fuzzy match

autor: Roberto Mendoza

# !pip install fuzzywuzzy # !pip install python-Levenshtein # !pip install swifter
Requirement already satisfied: rapidfuzz in c:\users\roberto\anaconda3\lib\site-packages (2.12.0)
# from fuzzywuzzy import fuzz # from fuzzywuzzy import process
# !pip install rapidfuzz from rapidfuzz import fuzz # cargar fuzzy mattch libraries from rapidfuzz import process import re # regular expresion import numpy as np import pandas as pd import swifter # procesos paralelos (comptudara más más rapida) import unidecode # to drop tildes import itertools
# Compare names.
# Case 1: same person, but the second spelling differs in casing and carries
# stray punctuation and a digit.
name1 = "Juan Pablo Villanueva Melcochita"
name2 = "juan pablo! villanueva 5 melcochita.."

# Identity check first, then plain string equality.
print(name1 is name2)
print(name1 == name2)
False False
# Similarity score of the two raw names, before any clean-up.
fuzz.ratio(name1,name2)
81.15942028985508
# Pre-processing is needed before comparing:
#   - remove digits and punctuation
#   - remove accents (not applied in this cell)
#   - lowercase
# The pattern is a raw string so that `\s` reaches the regex engine instead of
# being parsed as an (invalid) string escape.
# Removing the standalone `5` from name2 leaves the two spaces that surrounded
# it, which is why the score stays slightly below 100.
fuzz.ratio(name1.lower(), re.sub(r'[^a-zA-Z\s]', '', name2).lower())
98.46153846153847
# Show name2 with everything except ASCII letters and whitespace removed.
# Raw string: `\s` must be interpreted by the regex engine, not as a string
# escape.
re.sub(r'[^a-zA-Z\s]', '', name2)
'juan pablo villanueva melcochita'
# Print both cleaned names, then whether their text matches.
print(name1.lower())
print(re.sub(r'[^a-zA-Z\s]', '', name2).lower())
# Compare contents with `==`; `is` only tests whether both sides are the very
# same object, which is never the case for two freshly built strings.
# The result is still False here: deleting the `5` leaves a doubled space in
# the cleaned name2.
print(name1.lower() == re.sub(r'[^a-zA-Z\s]', '', name2).lower())
juan pablo villanueva melcochita juan pablo villanueva melcochita False
# Case 2: one of the names is missing words.
name1 = "Juan Pablo Villanueva Melcochita"
name2 = "Juan melcochita"

print(
    fuzz.ratio(name1, name2)
)
59.57446808510638
# partial_ratio looks at both strings and picks the shorter one to compare
# against the other. Here that is name2, which is then compared with name1.
print(
    fuzz.partial_ratio(name1.lower(), name2.lower())
)
88.88888888888889
# Case 3: same words in a different order.
# Original note: the token scorers preprocess the strings (lowercase, strip
# punctuation such as ,.?"$).
# NOTE(review): that describes fuzzywuzzy; rapidfuzz scorers appear to apply no
# preprocessing unless a `processor` is passed — confirm for the installed
# version. The inputs are lowercased explicitly below either way.
name1 = "Juan Pablo Villanueva"
name2 = "Villanueva Juan Pablo"

print(fuzz.ratio(name1.lower(), name2.lower()))
print(fuzz.partial_ratio(name1.lower(), name2.lower()))

# Last expression of the cell: shown as the cell result rather than printed.
fuzz.token_sort_ratio(name1.lower(), name2.lower())
47.61904761904761 64.51612903225806
100.0
# Case 4: repeated words -> use token_set_ratio.
name1 = "Juan Pablo Villanueva"
name2 = "Villanueva Villanueva Juan Pablo PABLO"

# One score per line, in the order: ratio, partial_ratio, token_sort_ratio,
# token_set_ratio.
print(
    *(
        scorer(name1.lower(), name2.lower())
        for scorer in (
            fuzz.ratio,
            fuzz.partial_ratio,
            fuzz.token_sort_ratio,
            fuzz.token_set_ratio,
        )
    ),
    sep="\n",
)
47.45762711864406 66.66666666666667 71.1864406779661 100.0
# A heavily garbled spelling (extra letters, symbols, digits, missing letters).
name1 = "Juan Pablo Villanueva"
name3 = "JuUan Po..%?+ 435, illanuevA$"

# Only this first call lowercases its inputs; the scorers below receive the
# strings with their original casing.
print(fuzz.ratio(name1.lower(), name3.lower()))

print(fuzz.partial_ratio(name1, name3))
print(fuzz.token_sort_ratio(name1, name3))
print(fuzz.token_set_ratio(name1, name3))
68.0 56.25 46.51162790697675 46.51162790697674
# Comparison of one name against a list of candidates.
lista_nombres = [
    'juan gutierrez',
    'Maria flores',
    'Paty nuñez',
    'Pablo miranda',
    'villa juan',
]

# Ranking with scores.
print(
    process.extract(name1, lista_nombres)
)
[('juan gutierrez', 85.5, 0), ('Pablo miranda', 85.5, 3), ('villa juan', 85.5, 4), ('Paty nuñez', 45.6, 2), ('Maria flores', 35.625, 1)]
# Cap the number of matches returned (here: three).
print(
    process.extract(name1, lista_nombres, limit=3)
)
[('juan gutierrez', 85.5, 0), ('Pablo miranda', 85.5, 3), ('villa juan', 85.5, 4)]
# Single best-scoring candidate.
process.extractOne(
    name1,
    lista_nombres,
)
('juan gutierrez', 85.5, 0)
# Using different scorers; the top two matches are printed for each, in order:
#   1. fuzz.ratio
#   2. fuzz.partial_ratio
#   3. fuzz.token_sort_ratio
#   4. fuzz.token_set_ratio
print(
    *(
        process.extract(name1, lista_nombres, scorer=scorer, limit=2)
        for scorer in (
            fuzz.ratio,
            fuzz.partial_ratio,
            fuzz.token_sort_ratio,
            fuzz.token_set_ratio,
        )
    ),
    sep="\n",
)
[('Pablo miranda', 58.82352941176471, 3), ('villa juan', 45.16129032258065, 4)] [('villa juan', 70.0, 4), ('Pablo miranda', 69.23076923076923, 3)] [('villa juan', 64.51612903225806, 4), ('Pablo miranda', 47.05882352941176, 3)] [('villa juan', 64.51612903225806, 4), ('Pablo miranda', 58.8235294117647, 3)]
# Keep the three best fuzz.ratio candidates for later filtering.
matches = process.extract(
    name1,
    lista_nombres,
    scorer=fuzz.ratio,
    limit=3,
)
matches
[('Pablo miranda', 58.82352941176471, 3), ('villa juan', 45.16129032258065, 4), ('juan gutierrez', 40.0, 0)]
def therehold(x, min_score):
    """Blank out a match whose score is below ``min_score``.

    Args:
        x: Indexable match record; ``x[0]`` is the matched value and ``x[1]``
            its score (extra items, if any, are ignored).
        min_score: Lowest score that is still accepted.

    Returns:
        ``(x[0], x[1])`` when the score is not below ``min_score``, otherwise
        ``(nan, nan)``.
    """
    candidate, score = x[0], x[1]
    if score < min_score:
        return (np.nan, np.nan)
    return (candidate, score)


# Apply a cut-off of 50 to every entry of `matches`.
[therehold(match, 50) for match in matches]
[('Pablo miranda', 58.82352941176471), (nan, nan), (nan, nan)]
def _top_matches(row, column_data2, scorer, min_score, limit=3):
    """Return the best candidates for ``row`` as a flat, fixed-width list.

    Args:
        row: The string to look up.
        column_data2: Collection of candidate strings to search in.
        scorer: Scoring function handed to ``process.extract``.
        min_score: Candidates scoring below this are replaced by NaN pairs
            (via ``therehold``).
        limit: Maximum number of candidates requested.

    Returns:
        ``[name_1, score_1, ..., name_limit, score_limit]``.
    """
    candidates = process.extract(row, column_data2, scorer=scorer, limit=limit)
    pairs = [therehold(candidate, min_score) for candidate in candidates]
    # Fewer than `limit` candidates may come back (short candidate list); pad
    # with NaN pairs so every row expands to the same number of columns.
    pairs.extend([(np.nan, np.nan)] * (limit - len(pairs)))
    return list(itertools.chain.from_iterable(pairs))


# fuzz.ratio
def fuzz_ratio(row, column_data2: pd.Series, min_score):
    """Top three ``fuzz.ratio`` matches of ``row`` in ``column_data2``."""
    return _top_matches(row, column_data2, fuzz.ratio, min_score)


# fuzz.partial_ratio
def fuzz_partial_ratio(row, column_data2: pd.Series, min_score):
    """Top three ``fuzz.partial_ratio`` matches of ``row`` in ``column_data2``."""
    return _top_matches(row, column_data2, fuzz.partial_ratio, min_score)


# fuzz.token_sort_ratio
def fuzz_token_sort_ratio(row, column_data2: pd.Series, min_score):
    """Top three ``fuzz.token_sort_ratio`` matches of ``row`` in ``column_data2``."""
    return _top_matches(row, column_data2, fuzz.token_sort_ratio, min_score)


# fuzz.token_set_ratio
def fuzz_token_set_ratio(row, column_data2: pd.Series, min_score):
    """Top three ``fuzz.token_set_ratio`` matches of ``row`` in ``column_data2``."""
    return _top_matches(row, column_data2, fuzz.token_set_ratio, min_score)
# Load the two sheets of the names workbook.
data_1 = pd.read_excel(
    r'../data/Fuzzy/nombres.xlsx',
    sheet_name='Hoja1',
)
data_2 = pd.read_excel(
    r'../data/Fuzzy/nombres.xlsx',
    sheet_name='Hoja2',
)
# Display the first loaded dataset.
data_1
# Display the second loaded dataset.
data_2
# Everything that is neither an ASCII letter nor whitespace. Raw string so that
# `\s` is interpreted by the regex engine, not as a string escape.
_NON_LETTERS = re.compile(r'[^a-zA-Z\s]')


def _strip_accents(row):
    """Trim surrounding whitespace and transliterate ``row`` with unidecode."""
    return unidecode.unidecode(row.strip())


# function: removes accents and surrounding whitespace, then lowercases
def function1(row):
    """Return ``row`` trimmed, without accents and in lowercase.

    Digits and punctuation are kept.
    """
    return _strip_accents(row).lower()


# function: removes accents and surrounding whitespace, keeps only letters and
# spaces and, finally, lowercases
def function2(row):
    """Return ``row`` reduced to lowercase, unaccented letters and single spaces.

    Removing digits and punctuation can leave doubled or trailing spaces
    (e.g. around a standalone number), so whitespace is normalised again after
    the removal rather than only before it.
    """
    letters_only = _NON_LETTERS.sub('', _strip_accents(row))
    return ' '.join(letters_only.split()).lower()
# Normalise the name column of each dataset: data_1 keeps digits/punctuation
# (function1), data_2 is reduced to letters and spaces (function2).
data_1['Nombre'] = data_1['Nombre'].apply(
    function1
)
data_2['Nombre'] = data_2['Nombre'].apply(
    function2
)
# For every name in data_1, store the list returned by fuzz_partial_ratio when
# searching the data_2 names with a minimum score of 70. The call goes through
# the `.swifter` accessor instead of a plain `.apply`.
data_1['partial_ratio'] = data_1['Nombre'].swifter.apply(lambda x: fuzz_partial_ratio(x, data_2['Nombre'], min_score = 70))
Pandas Apply: 0%| | 0/122 [00:00<?, ?it/s]
# Display only the column holding the per-row match lists.
data_1[['partial_ratio']]
# Expand each row's flat match list into its own columns: positions 0/1, 2/3
# and 4/5 hold the (name, score) of the first, second and third candidate.
match_partial_ratio = pd.DataFrame(
    data_1['partial_ratio'].values.tolist()
).rename(
    columns={
        0: "partial_match_name_1",
        1: "partial_match_score_1",
        2: "partial_match_name_2",
        3: "partial_match_score_2",
        4: "partial_match_name_3",
        5: "partial_match_score_3",
    }
)

# Put the new columns next to the original data_1 columns.
data_1 = pd.concat(
    [data_1, match_partial_ratio],
    axis=1,
)
# Display the expanded match columns on their own.
match_partial_ratio
# Display data_1 as it currently stands.
data_1
# Bring in the data_2 record of the first and of the second candidate.
# Left joins keep every data_1 row; validate="m:1" makes pandas check that
# data_2 has at most one row per name. Clashing data_2 columns get the suffix
# `_y` in the first join and `_z` in the second.
data_fuzzy_match = (
    data_1
    .merge(
        data_2,
        left_on="partial_match_name_1",
        right_on="Nombre",
        how="left",
        validate="m:1",
        suffixes=('', '_y'),
    )
    .merge(
        data_2,
        left_on="partial_match_name_2",
        right_on="Nombre",
        how="left",
        validate="m:1",
        suffixes=('', '_z'),
    )
)
data_fuzzy_match
# Give the columns contributed by each join a name that says which candidate
# they belong to. The keys must match the existing column labels exactly.
data_fuzzy_match.rename(
    columns={
        "Mtematica": "Matematica_match_1",
        "Letras": "Letras_match_1",
        "Mtematica_z": "Matematica_match_2",
        "Letras_z": "Letras_match_2",
    },
    inplace=True,
)

# The join keys coming from data_2 duplicate the partial_match_name_* columns.
data_fuzzy_match.drop(columns=['Nombre_y', 'Nombre_z'], inplace=True)
data_fuzzy_match