CoCalc -- string

GitHub Repository: sagemath/sage
Path: blob/develop/src/sage/monoids/string_ops.py
⁴⁰⁵⁵ views
1
"Utility functions on strings"
2

3
# ****************************************************************************
4
#       Copyright (C) 2007 David Kohel <[email protected]>
5
#
6
#  Distributed under the terms of the GNU General Public License (GPL)
7
#
8
#                  https://www.gnu.org/licenses/
9
# ****************************************************************************
10
from typing import Any
11

12
from sage.misc.lazy_import import lazy_import
13
from .string_monoid_element import StringMonoidElement
14

15
lazy_import('sage.rings.real_mpfr', 'RealField')
16

17

18
def strip_encoding(S) -> str:
19
    """
20
    Return the upper case string of S stripped of all non-alphabetic characters.
21

22
    EXAMPLES::
23

24
        sage: S = "The cat in the hat."
25
        sage: strip_encoding(S)
26
        'THECATINTHEHAT'
27

28
    TESTS::
29

30
        sage: strip_encoding(44)
31
        Traceback (most recent call last):
32
        ...
33
        TypeError: argument S (= 44) must be a string
34
    """
35
    if not isinstance(S, str):
36
        raise TypeError(f"argument S (= {S}) must be a string")
37
    return ''.join(letter.upper() for letter in S if letter.isalpha())
38

39

40
def frequency_distribution(S, n=1, field=None):
41
    """
42
    The probability space of frequencies of n-character substrings of S.
43

44
    EXAMPLES::
45

46
        sage: frequency_distribution('banana not a nana nor ananas', 2)
47
        Discrete probability space defined by {' a': 0.0740740740740741,
48
         ' n': 0.111111111111111,
49
         'a ': 0.111111111111111,
50
         'an': 0.185185185185185,
51
         'as': 0.0370370370370370,
52
         'ba': 0.0370370370370370,
53
         'na': 0.222222222222222,
54
         'no': 0.0740740740740741,
55
         'or': 0.0370370370370370,
56
         'ot': 0.0370370370370370,
57
         'r ': 0.0370370370370370,
58
         't ': 0.0370370370370370}
59
    """
60
    from sage.probability.random_variable import DiscreteProbabilitySpace
61
    if isinstance(S, tuple):
62
        S = list(S)
63
    elif isinstance(S, (str, StringMonoidElement)):
64
        S = [S[i:i+n] for i in range(len(S)-n+1)]
65
    if field is None:
66
        field = RealField()
67
    if isinstance(S, list):
68
        P: dict[str, Any] = {}
69
        N = len(S)
70
        eps = field.one() / N
71
        for i in range(N):
72
            c = S[i]
73
            if c in P:
74
                P[c] += eps
75
            else:
76
                P[c] = eps
77
        return DiscreteProbabilitySpace(S, P, field)
78
    raise TypeError("Argument S (= %s) must be a string, list, or tuple.")
79

80

81
def coincidence_index(S, n=1):
82
    """
83
    Return the coincidence index of the string ``S``.
84

85
    EXAMPLES::
86

87
        sage: S = strip_encoding("The cat in the hat.")
88
        sage: coincidence_index(S)
89
        0.120879120879121
90
    """
91
    if not isinstance(S, str):
92
        try:
93
            S.coincidence_index(n)
94
        except AttributeError:
95
            raise TypeError("Argument S (= %s) must be a string.")
96
    S = strip_encoding(S)
97
    N = len(S)-n+1
98
    X: dict[str, int] = {}
99
    for i in range(N):
100
        c = S[i:i+n]
101
        if c in X:
102
            X[c] += 1
103
        else:
104
            X[c] = 1
105
    RR = RealField()
106
    return RR(sum([m*(m-1) for m in X.values()]))/RR(N*(N-1))
107

108

109
def coincidence_discriminant(S, n=2):
110
    """
111
    INPUT:
112

113
    - ``S`` --tuple of strings; e.g. produced as decimation of transposition
114
      ciphertext, or a sample plaintext
115

116
    OUTPUT:
117

118
    A measure of the difference of probability of association of
119
    character pairs, relative to their independent one-character probabilities.
120

121
    EXAMPLES::
122

123
        sage: S = strip_encoding("The cat in the hat.")
124
        sage: coincidence_discriminant([ S[i:i+2] for i in range(len(S)-1) ])
125
        0.0827001855677322
126
    """
127
    if not isinstance(S, (list, tuple)):
128
        raise TypeError("Argument S (= %s) must be a list or tuple" % S)
129
    if n != 2:
130
        raise ValueError("Argument n (= %s) is only implemented for n = 2" % n)
131
    if not all(isinstance(c, (str, StringMonoidElement)) for c in S):
132
        raise TypeError("Argument S (= %s) must be a list of strings.")
133
    if not all(len(c) == n for c in S):
134
        raise ValueError("Argument S (= %s) must be a list of strings of length 2" % S)
135
    X1 = [frequency_distribution([s[i] for s in S]) for i in range(2)]
136
    XX = frequency_distribution(S)
137
    if isinstance(S[0], StringMonoidElement):
138
        M = S[0].parent()
139
        n = M.ngens()
140
        return sum([(XX(M([i, j]))-X1[0](M([i]))*X1[1](M([j])))**2 for i in range(n) for j in range(n)])
141
    AZ = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
142
    return sum([(XX(AZ[i]+AZ[j])-X1[0](AZ[i])*X1[1](AZ[j]))**2 for i in range(26) for j in range(26)])
143

144
Product

Resources

Company