Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
afnan47
GitHub Repository: afnan47/sem7
Path: blob/main/IR/Assignment 1/one_similiarity.py
419 views
1
import math
2
import string
3
import sys
4
5
6
def read_file(filename):
    """Return the entire contents of the text file *filename* as one string.

    If the file cannot be opened or read, an error message is written to
    standard error and the process is terminated with exit status 1 (a
    ``SystemExit`` is raised), so shells and calling scripts can detect the
    failure.
    """
    try:
        with open(filename, 'r') as f:
            return f.read()
    except IOError:
        # A bare sys.exit() would terminate with status 0, which signals
        # success; report on stderr and exit non-zero instead.
        print("Error opening or reading input file: ", filename, file=sys.stderr)
        sys.exit(1)
16
17
18
# Character map used to normalise text before it is split into words:
# every ASCII uppercase letter maps to its lowercase form and every
# punctuation character maps to a single space.
translation_table = str.maketrans(
    string.ascii_uppercase + string.punctuation,
    string.ascii_lowercase + " " * len(string.punctuation),
)
19
20
21
def get_words_from_line_list(text):
    """Return the list of words contained in the string *text*.

    The text is first mapped through the module-level ``translation_table``
    and the result is then split on runs of whitespace.
    """
    return text.translate(translation_table).split()
27
28
29
30
def count_frequency(word_list):
    """Return a dict mapping each distinct word to its occurrence count.

    Keys appear in the order in which each word is first encountered in
    *word_list*.
    """
    counts = {}
    for word in word_list:
        # Unseen words start from 0, so the first occurrence stores 1.
        counts[word] = counts.get(word, 0) + 1
    return counts
43
44
45
def word_frequencies_for_file(filename):
    """Return the word-frequency mapping for the file *filename*.

    The file is read, split into normalised words and counted. A short
    summary (line count, word count, distinct-word count) is printed to
    standard output along the way.

    Returns:
        The dict produced by ``count_frequency`` for the file's words.
    """
    text = read_file(filename)
    word_list = get_words_from_line_list(text)
    freq_mapping = count_frequency(word_list)

    # read_file() returns the whole file as a single string, so len(text)
    # would be the number of characters; splitlines() yields the real
    # number of lines.
    line_count = len(text.splitlines())

    print("File", filename, ":")
    print(line_count, "lines, ")
    print(len(word_list), "words, ")
    print(len(freq_mapping), "distinct words")

    return freq_mapping
57
58
59
60
def dotProduct(D1, D2):
    """Return the inner product of two sparse vectors stored as dicts.

    Only keys present in both *D1* and *D2* contribute; the result is
    always a float (0.0 when the dicts share no keys).
    """
    total = 0.0
    for term, weight in D1.items():
        if term not in D2:
            continue
        total += weight * D2[term]
    return total
69
70
71
def vector_angle(D1, D2):
    """Return the angle, in radians, between two sparse vectors.

    *D1* and *D2* are dicts as accepted by ``dotProduct``. The angle is
    ``acos(D1.D2 / (|D1| * |D2|))``.

    Raises:
        ZeroDivisionError: if either vector has zero magnitude (for
            example an empty dict), since the angle is then undefined.
    """
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1) * dotProduct(D2, D2))

    if denominator == 0:
        # Same exception type the bare division would raise, but with a
        # message that explains the actual cause.
        raise ZeroDivisionError(
            "cannot compute the angle for a vector with zero magnitude"
        )

    # Floating-point rounding can push the ratio marginally outside
    # [-1, 1], where math.acos raises ValueError; clamp it back in.
    cosine = max(-1.0, min(1.0, numerator / denominator))
    return math.acos(cosine)
76
77
78
def documentSimilarity(filename_1, filename_2):
    """Print the angular distance, in radians, between two text files.

    Each file is turned into a word-frequency mapping (which also prints
    a per-file summary), and the angle between the two mappings is
    reported on standard output. Nothing is returned.
    """
    frequencies_1 = word_frequencies_for_file(filename_1)
    frequencies_2 = word_frequencies_for_file(filename_2)
    angle = vector_angle(frequencies_1, frequencies_2)
    message = "The distance between the documents is: % 0.6f (radians)" % angle
    print(message)
86
87
88
# Script entry point: compare the two sample documents and print their
# angular distance. NOTE: this call is not guarded by
# `if __name__ == "__main__":`, so it also runs when the module is imported.
documentSimilarity('sample1.txt', 'sample2.txt')
89
90
91
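# OUTPUT (recorded sample run; each "lines" figure below is len() of the file's full text, i.e. its character count)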
92
93
# File sample1.txt :
94
# 598 lines,
95
# 113 words,
96
# 66 distinct words
97
# File sample2.txt :
98
# 779 lines,
99
# 154 words,
100
# 89 distinct words
101
# The distance between the documents is: 0.618456 (radians)
102