Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sage
Path: blob/develop/src/sage/monoids/string_ops.py
4055 views
1
"Utility functions on strings"
2
3
# ****************************************************************************
4
# Copyright (C) 2007 David Kohel <[email protected]>
5
#
6
# Distributed under the terms of the GNU General Public License (GPL)
7
#
8
# https://www.gnu.org/licenses/
9
# ****************************************************************************
10
from typing import Any
11
12
from sage.misc.lazy_import import lazy_import
13
from .string_monoid_element import StringMonoidElement
14
15
lazy_import('sage.rings.real_mpfr', 'RealField')
16
17
18
def strip_encoding(S) -> str:
19
"""
20
Return the upper case string of S stripped of all non-alphabetic characters.
21
22
EXAMPLES::
23
24
sage: S = "The cat in the hat."
25
sage: strip_encoding(S)
26
'THECATINTHEHAT'
27
28
TESTS::
29
30
sage: strip_encoding(44)
31
Traceback (most recent call last):
32
...
33
TypeError: argument S (= 44) must be a string
34
"""
35
if not isinstance(S, str):
36
raise TypeError(f"argument S (= {S}) must be a string")
37
return ''.join(letter.upper() for letter in S if letter.isalpha())
38
39
40
def frequency_distribution(S, n=1, field=None):
41
"""
42
The probability space of frequencies of n-character substrings of S.
43
44
EXAMPLES::
45
46
sage: frequency_distribution('banana not a nana nor ananas', 2)
47
Discrete probability space defined by {' a': 0.0740740740740741,
48
' n': 0.111111111111111,
49
'a ': 0.111111111111111,
50
'an': 0.185185185185185,
51
'as': 0.0370370370370370,
52
'ba': 0.0370370370370370,
53
'na': 0.222222222222222,
54
'no': 0.0740740740740741,
55
'or': 0.0370370370370370,
56
'ot': 0.0370370370370370,
57
'r ': 0.0370370370370370,
58
't ': 0.0370370370370370}
59
"""
60
from sage.probability.random_variable import DiscreteProbabilitySpace
61
if isinstance(S, tuple):
62
S = list(S)
63
elif isinstance(S, (str, StringMonoidElement)):
64
S = [S[i:i+n] for i in range(len(S)-n+1)]
65
if field is None:
66
field = RealField()
67
if isinstance(S, list):
68
P: dict[str, Any] = {}
69
N = len(S)
70
eps = field.one() / N
71
for i in range(N):
72
c = S[i]
73
if c in P:
74
P[c] += eps
75
else:
76
P[c] = eps
77
return DiscreteProbabilitySpace(S, P, field)
78
raise TypeError("Argument S (= %s) must be a string, list, or tuple.")
79
80
81
def coincidence_index(S, n=1):
82
"""
83
Return the coincidence index of the string ``S``.
84
85
EXAMPLES::
86
87
sage: S = strip_encoding("The cat in the hat.")
88
sage: coincidence_index(S)
89
0.120879120879121
90
"""
91
if not isinstance(S, str):
92
try:
93
S.coincidence_index(n)
94
except AttributeError:
95
raise TypeError("Argument S (= %s) must be a string.")
96
S = strip_encoding(S)
97
N = len(S)-n+1
98
X: dict[str, int] = {}
99
for i in range(N):
100
c = S[i:i+n]
101
if c in X:
102
X[c] += 1
103
else:
104
X[c] = 1
105
RR = RealField()
106
return RR(sum([m*(m-1) for m in X.values()]))/RR(N*(N-1))
107
108
109
def coincidence_discriminant(S, n=2):
110
"""
111
INPUT:
112
113
- ``S`` --tuple of strings; e.g. produced as decimation of transposition
114
ciphertext, or a sample plaintext
115
116
OUTPUT:
117
118
A measure of the difference of probability of association of
119
character pairs, relative to their independent one-character probabilities.
120
121
EXAMPLES::
122
123
sage: S = strip_encoding("The cat in the hat.")
124
sage: coincidence_discriminant([ S[i:i+2] for i in range(len(S)-1) ])
125
0.0827001855677322
126
"""
127
if not isinstance(S, (list, tuple)):
128
raise TypeError("Argument S (= %s) must be a list or tuple" % S)
129
if n != 2:
130
raise ValueError("Argument n (= %s) is only implemented for n = 2" % n)
131
if not all(isinstance(c, (str, StringMonoidElement)) for c in S):
132
raise TypeError("Argument S (= %s) must be a list of strings.")
133
if not all(len(c) == n for c in S):
134
raise ValueError("Argument S (= %s) must be a list of strings of length 2" % S)
135
X1 = [frequency_distribution([s[i] for s in S]) for i in range(2)]
136
XX = frequency_distribution(S)
137
if isinstance(S[0], StringMonoidElement):
138
M = S[0].parent()
139
n = M.ngens()
140
return sum([(XX(M([i, j]))-X1[0](M([i]))*X1[1](M([j])))**2 for i in range(n) for j in range(n)])
141
AZ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
142
return sum([(XX(AZ[i]+AZ[j])-X1[0](AZ[i])*X1[1](AZ[j]))**2 for i in range(26) for j in range(26)])
143
144