# Start from an empty global environment so objects left over from an earlier
# session cannot leak into this lab. `all.names` is written out in full rather
# than relying on partial matching of `all`.
rm(list = ls(all.names = TRUE))
library(tidyverse)
library(readxl)
library(stringdist)
library(fuzzyjoin)

# Point the working directory at the Lab9 folder of the course repository.
# NOTE(review): the path is built from the USERNAME environment variable and a
# C:/ prefix, so it presumably only resolves on a Windows machine with the
# repository cloned under Documents/GitHub -- confirm for other setups.
user <- Sys.getenv("USERNAME")
lab_dir <- file.path("C:/Users", user, "Documents/GitHub/1ECO35_2022_2/Lab9")
setwd(lab_dir)

# Both input tables come from the same workbook (resolved relative to the
# working directory set above): sheet "Hoja1" is loaded as `master` and sheet
# "Hoja2" as `usdata`.
names_workbook <- "../data/Fuzzy/nombres.xlsx"
master <- read_excel(names_workbook, sheet = "Hoja1")
usdata <- read_excel(names_workbook, sheet = "Hoja2")
# ---- String-distance demo ----
# Compare one reference name against several variants (extra punctuation and
# digits, missing tokens, reordered tokens, repeated tokens) under three
# stringdist methods: "lv", "jw" and "cosine".
name1 <- "Juan Pablo Villanueva Melcochita"

# One list element per experiment. The first experiment compares against two
# candidate strings at once, so its results are length-2 vectors.
name_variants <- list(
  c("juan pablo! villanueva 5 melcochita..", "Jose fabricio"),
  "Juan melcochita",
  "Villanueva Juan Pablo",
  "Villanueva Villanueva Juan Pablo PABLO"
)

for (name2 in name_variants) {
  # "lv" is reported as a raw distance. It is printed explicitly so the value
  # is also shown when the script is run through source(), where bare
  # top-level expressions are not auto-printed by default.
  print(stringdist(name1, name2, method = "lv"))

  # "jw" and "cosine" are reported as similarities (1 - distance).
  # The variable name `similarites` (sic) is kept unchanged in case later code
  # reads it.
  for (metric in c("jw", "cosine")) {
    similarites <- 1 - stringdist(name1, name2, method = metric)
    print(similarites)
  }
}
# Fuzzy left join of `master` against `usdata` on the "Nombre" column using
# the "lv" string distance. Pairs with a distance of at most 99 are kept and
# the distance of each pair is stored in the `lv_score` column.
data_match_lv <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "lv",
  mode = "left",
  max_dist = 99,
  # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
  ignore_case = TRUE,
  distance_col = "lv_score"
)
# Fuzzy left join of `master` against `usdata` on the "Nombre" column using
# the "jw" string distance; the distance of each pair is stored in `jw_score`.
# `max_dist` is not set here, so stringdist_join's default applies.
# NOTE(review): per the fuzzyjoin documentation that default is 2, which is
# above the 0-1 range of this metric, so presumably no pair is filtered out --
# confirm that an all-pairs result is intended.
data_match_jw <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "jw",
  mode = "left",
  # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
  ignore_case = TRUE,
  distance_col = "jw_score"
)
# Fuzzy left join of `master` against `usdata` on the "Nombre" column using
# the "cosine" string distance; the distance of each pair is stored in
# `cs_score`. `max_dist` is not set here, so stringdist_join's default applies.
# NOTE(review): per the fuzzyjoin documentation that default is 2, which is
# above the 0-1 range of this metric, so presumably no pair is filtered out --
# confirm that an all-pairs result is intended.
data_match_cs <- stringdist_join(
  master, usdata,
  by = "Nombre",
  method = "cosine",
  mode = "left",
  # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
  ignore_case = TRUE,
  distance_col = "cs_score"
)
# Keep, for every name in `master` (column `Nombre.x` after the join), the
# three candidate matches with the smallest `lv_score`.
# This reuses the `data_match_lv` join computed earlier in the script instead
# of running the identical all-pairs join a second time.
# slice_min() keeps ties by default, so a group may return more than 3 rows.
# The result is still grouped by `Nombre.x`; call ungroup() before doing
# whole-table operations on it.
data_match_lv <- data_match_lv %>%
  group_by(Nombre.x) %>%
  slice_min(order_by = lv_score, n = 3)