# Source: GitHub repository robertopucp/1eco35_2022_2
# Path: blob/main/Lab9/script_fuzzy_r.R
################ Class 9: Fuzzy match ############################
## Course: R and Python Laboratory ###########################
## @author: Roberto Mendoza

# Clear the global environment (including objects whose names start with ".").
# NOTE(review): wiping the workspace and calling setwd() are side effects on
# the user's session; both are kept because the script has always relied on
# them.
rm(list = ls(all.names = TRUE))

# Load libraries ----

library(tidyverse)
library(readxl)
library(stringdist) # string similarity / distance metrics
library(fuzzyjoin) # merge datasets using a name or text column

# The user name is read from the USERNAME environment variable and used to
# build the per-user path below.
user <- Sys.getenv("USERNAME") # username

# Set the working directory to this lab's folder, so that relative data paths
# used later in the script are resolved from here.
setwd(paste0("C:/Users/", user, "/Documents/GitHub/1ECO35_2022_2/Lab9"))
# Load excel files ----

# Both tables are read from the same workbook, one sheet each.
master <- read_excel("../data/Fuzzy/nombres.xlsx", sheet = "Hoja1")

usdata <- read_excel("../data/Fuzzy/nombres.xlsx", sheet = "Hoja2")
# Comparing names
# Case 1: the first candidate is name1 in lower case with extra punctuation
# and a stray digit; the second candidate is a different name.

name1 <- "Juan Pablo Villanueva Melcochita"
name2 <- c("juan pablo! villanueva 5 melcochita..", "Jose fabricio")

# Use help() to read the function's documentation
# help("stringdist")

# Fuzzy match ----

# Distance based on the Levenshtein distance
stringdist(name1, name2, method = "lv")

# Jaro-Winkler method
# The distance is subtracted from 1 so the value reads as a similarity.
similarites <- 1 - stringdist(name1, name2, method = "jw")

print(similarites)

# Cosine method
similarites <- 1 - stringdist(name1, name2, method = "cosine")

print(similarites)

# help('stringdist-metrics')
# Case 2: missing words

name2 <- "Juan melcochita"

# Distance based on the Levenshtein distance
stringdist(name1, name2, method = "lv")

# Jaro-Winkler method
# The distance is subtracted from 1 so the value reads as a similarity.
similarites <- 1 - stringdist(name1, name2, method = "jw")

print(similarites)

# Cosine method
similarites <- 1 - stringdist(name1, name2, method = "cosine")

print(similarites)
# Case 3: different word order

name2 <- "Villanueva Juan Pablo"

# Distance based on the Levenshtein distance
stringdist(name1, name2, method = "lv")

# Jaro-Winkler method
# The distance is subtracted from 1 so the value reads as a similarity.
similarites <- 1 - stringdist(name1, name2, method = "jw")

print(similarites)

# Cosine method
similarites <- 1 - stringdist(name1, name2, method = "cosine")

print(similarites)
# Case 4: repeated words

name2 <- "Villanueva Villanueva Juan Pablo PABLO"

# Distance based on the Levenshtein distance
stringdist(name1, name2, method = "lv")

# Jaro-Winkler method
# The distance is subtracted from 1 so the value reads as a similarity.
similarites <- 1 - stringdist(name1, name2, method = "jw")

print(similarites)

# Cosine method
similarites <- 1 - stringdist(name1, name2, method = "cosine")

print(similarites)
# Fuzzy join ----

# Levenshtein distance
# Left fuzzy join of master against usdata on the "Nombre" column, ignoring
# case; the distance of each matched pair is stored in the lv_score column.
data_match_lv <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "lv",
  mode = "left",
  max_dist = 99, # maximum number of changes
  ignore_case = TRUE,
  distance_col = "lv_score"
)
# Jaro-Winkler method
# Left fuzzy join on "Nombre", ignoring case; max_dist is left at its default
# and the distance of each matched pair is stored in the jw_score column.
data_match_jw <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "jw",
  mode = "left",
  ignore_case = TRUE,
  distance_col = "jw_score"
)
# Cosine method
# Left fuzzy join on "Nombre", ignoring case; max_dist is left at its default
# and the distance of each matched pair is stored in the cs_score column.
data_match_cs <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "cosine",
  mode = "left",
  ignore_case = TRUE,
  distance_col = "cs_score"
)
# Selecting the 3 best matches
# The Levenshtein join is recomputed (overwriting data_match_lv) and, for each
# value of Nombre.x, only the rows with the smallest lv_score are kept.
# NOTE(review): Nombre.x is presumably the "Nombre" column from master (the
# left table) - confirm against the join output.
# NOTE(review): slice_min() is called without with_ties, so tied scores may
# yield more than 3 rows per group - confirm this is intended.
data_match_lv <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "lv",
  mode = "left",
  max_dist = 99, # maximum number of changes
  ignore_case = TRUE,
  distance_col = "lv_score"
) %>%
  group_by(Nombre.x) %>%
  slice_min(order_by = lv_score, n = 3) %>%
  ungroup() # drop the grouping so later operations are not silently per-group
172