Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemath
GitHub Repository: sagemath/sagelib
Path: blob/master/sage/monoids/string_monoid_element.py
4045 views
1
"""
2
String Monoid Elements
3
4
AUTHORS:
5
6
- David Kohel <[email protected]>, 2007-01
7
8
Elements of free string monoids, internal representation subject to change.
9
10
These are special classes of free monoid elements with distinct printing.
11
12
The internal representation of elements does not use the exponential
13
compression of FreeMonoid elements (a feature), and could be packed into words.
14
"""
15
16
#*****************************************************************************
17
# Copyright (C) 2007 David Kohel <[email protected]>
18
#
19
# Distributed under the terms of the GNU General Public License (GPL)
20
#
21
# http://www.gnu.org/licenses/
22
#*****************************************************************************
23
24
# import operator
25
from sage.rings.integer import Integer
26
from sage.rings.all import RealField
27
# from sage.structure.element import MonoidElement
28
from sage.probability.random_variable import DiscreteProbabilitySpace
29
from free_monoid_element import FreeMonoidElement
30
import string_monoid
31
32
def is_StringMonoidElement(x):
33
r"""
34
"""
35
return isinstance(x, StringMonoidElement)
36
37
def is_AlphabeticStringMonoidElement(x):
38
r"""
39
"""
40
return isinstance(x, StringMonoidElement) and \
41
isinstance(x.parent(), string_monoid.AlphabeticStringMonoid)
42
43
def is_BinaryStringMonoidElement(x):
44
r"""
45
"""
46
return isinstance(x, StringMonoidElement) and \
47
isinstance(x.parent(), string_monoid.BinaryStringMonoid)
48
49
def is_OctalStringMonoidElement(x):
50
r"""
51
"""
52
return isinstance(x, StringMonoidElement) and \
53
isinstance(x.parent(), string_monoid.OctalStringMonoid)
54
55
def is_HexadecimalStringMonoidElement(x):
56
r"""
57
"""
58
return isinstance(x, StringMonoidElement) and \
59
isinstance(x.parent(), string_monoid.HexadecimalStringMonoid)
60
61
def is_Radix64StringMonoidElement(x):
62
r"""
63
"""
64
return isinstance(x, StringMonoidElement) and \
65
isinstance(x.parent(), string_monoid.Radix64StringMonoid)
66
67
68
class StringMonoidElement(FreeMonoidElement):
69
"""
70
Element of a free string monoid.
71
"""
72
73
def __init__(self, S, x, check=True):
74
"""
75
Create the element ``x`` of the StringMonoid ``S``.
76
77
This should typically be called by a StringMonoid.
78
"""
79
FreeMonoidElement.__init__(self, S, [])
80
if isinstance(x, list):
81
if check:
82
for b in x:
83
if not isinstance(b, (int, long, Integer)):
84
raise TypeError(
85
"x (= %s) must be a list of integers." % x)
86
self._element_list = list(x) # make copy
87
elif isinstance(x, str):
88
alphabet = list(self.parent().alphabet())
89
self._element_list = []
90
for i in range(len(x)):
91
try:
92
b = alphabet.index(x[i])
93
except ValueError:
94
raise TypeError(
95
"Argument x (= %s) is not a valid string." % x)
96
self._element_list += [b]
97
else:
98
raise TypeError("Argument x (= %s) is of the wrong type." % x)
99
100
def __cmp__(left, right):
101
"""
102
Compare two free monoid elements with the same parents.
103
104
The ordering is the one on the underlying sorted list of
105
(monomial,coefficients) pairs.
106
107
EXAMPLES::
108
109
sage: S = BinaryStrings()
110
sage: (x,y) = S.gens()
111
sage: x * y < y * x
112
True
113
sage: S("01") < S("10")
114
True
115
"""
116
return cmp(left._element_list, right._element_list)
117
118
def _repr_(self):
119
"""
120
The self-representation of a string monoid element. Unlike Python
121
strings we print without the enclosing quotes.
122
"""
123
S = self.parent()
124
alphabet = S.alphabet()
125
s = ''
126
for b in self._element_list:
127
c = alphabet[b]
128
s += c
129
return s
130
131
def _latex_(self):
132
"""
133
Return latex representation of self.
134
135
EXAMPLES::
136
137
sage: S = BinaryStrings()
138
sage: s = S('101111000')
139
sage: latex(s)
140
101111000
141
"""
142
return self._repr_()
143
144
def __mul__(self, y):
145
"""
146
Multiply 2 free string monoid elements.
147
148
EXAMPLES::
149
150
sage: S = BinaryStrings()
151
sage: (x,y) = S.gens()
152
sage: x*y
153
01
154
"""
155
if not isinstance(y, StringMonoidElement):
156
raise TypeError("Argument y (= %s) is of wrong type." % y)
157
S = self.parent()
158
x_elt = self._element_list
159
y_elt = y._element_list
160
z = S('')
161
z._element_list = x_elt + y_elt
162
return z
163
164
def __pow__(self, n):
165
"""
166
Return the `n`-th power of the string element.
167
168
EXAMPLES::
169
170
sage: (x,y) = BinaryStrings().gens()
171
sage: x**3 * y**5 * x**7
172
000111110000000
173
sage: x**0
174
175
176
Note that raising to a negative power is *not* a constructor
177
for an element of the corresponding free group (yet).
178
179
::
180
181
sage: x**(-1)
182
Traceback (most recent call last):
183
...
184
IndexError: Argument n (= -1) must be non-negative.
185
"""
186
if not isinstance(n, (int, long, Integer)):
187
raise TypeError("Argument n (= %s) must be an integer." % n)
188
if n < 0:
189
raise IndexError("Argument n (= %s) must be non-negative." % n)
190
elif n == 0:
191
return self.parent()('')
192
elif n == 1:
193
return self
194
z = self.parent()('')
195
z._element_list = self._element_list * n
196
return z
197
198
def __len__(self):
199
"""
200
Return the number of products that occur in this monoid element.
201
For example, the length of the identity is 0, and the length
202
of the monoid `x_0^2x_1` is three.
203
204
EXAMPLES::
205
206
sage: S = BinaryStrings()
207
sage: z = S('')
208
sage: len(z)
209
0
210
sage: (x,y) = S.gens()
211
sage: len(x**2 * y**3)
212
5
213
"""
214
return len(self._element_list)
215
216
def __iter__(self):
217
"""
218
Return an iterator over this element as a string.
219
220
EXAMPLES::
221
222
sage: t = AlphabeticStrings()('SHRUBBERY')
223
sage: t.__iter__().next()
224
S
225
sage: list(t)
226
[S, H, R, U, B, B, E, R, Y]
227
"""
228
l = len(self._element_list)
229
i=0
230
while i < l:
231
yield self[i]
232
i+=1
233
raise StopIteration
234
235
def __getitem__(self, n):
236
"""
237
Return the n-th string character.
238
239
EXAMPLES::
240
241
sage: t = AlphabeticStrings()('SHRUBBERY')
242
sage: t[0]
243
S
244
sage: t[3]
245
U
246
sage: t[-1]
247
Y
248
"""
249
try:
250
c = self._element_list[n]
251
except:
252
raise IndexError("Argument n (= %s) is not a valid index." % n)
253
if not isinstance(c, list):
254
c = [c]
255
return self.parent()(c)
256
257
def decoding(self, padic=False):
258
r"""
259
The byte string associated to a binary or hexadecimal string
260
monoid element.
261
262
EXAMPLES::
263
264
sage: S = HexadecimalStrings()
265
sage: s = S.encoding("A..Za..z"); s
266
412e2e5a612e2e7a
267
sage: s.decoding()
268
'A..Za..z'
269
sage: s = S.encoding("A..Za..z",padic=True); s
270
14e2e2a516e2e2a7
271
sage: s.decoding()
272
'\x14\xe2\xe2\xa5\x16\xe2\xe2\xa7'
273
sage: s.decoding(padic=True)
274
'A..Za..z'
275
sage: S = BinaryStrings()
276
sage: s = S.encoding("A..Za..z"); s
277
0100000100101110001011100101101001100001001011100010111001111010
278
sage: s.decoding()
279
'A..Za..z'
280
sage: s = S.encoding("A..Za..z",padic=True); s
281
1000001001110100011101000101101010000110011101000111010001011110
282
sage: s.decoding()
283
'\x82ttZ\x86tt^'
284
sage: s.decoding(padic=True)
285
'A..Za..z'
286
"""
287
S = self.parent()
288
from Crypto.Util.number import long_to_bytes
289
if isinstance(S, string_monoid.AlphabeticStringMonoid):
290
return ''.join([ long_to_bytes(65+i) for i in self._element_list ])
291
n = self.__len__()
292
if isinstance(S, string_monoid.HexadecimalStringMonoid):
293
if not n % 2 == 0:
294
"String %s must have even length to determine a byte character string." % str(self)
295
s = []
296
x = self._element_list
297
for k in range(n//2):
298
m = 2*k
299
if padic:
300
c = long_to_bytes(x[m]+16*x[m+1])
301
else:
302
c = long_to_bytes(16*x[m]+x[m+1])
303
s.append(c)
304
return ''.join(s)
305
if isinstance(S, string_monoid.BinaryStringMonoid):
306
if not n % 8 == 0:
307
"String %s must have even length 0 mod 8 to determine a byte character string." % str(self)
308
pows = [ 2**i for i in range(8) ]
309
s = []
310
x = self._element_list
311
for k in range(n//8):
312
m = 8*k
313
if padic:
314
c = long_to_bytes(sum([ x[m+i]*pows[i] for i in range(8) ]))
315
else:
316
c = long_to_bytes(sum([ x[m+7-i]*pows[i] for i in range(8) ]))
317
s.append(c)
318
return ''.join(s)
319
raise TypeError(
320
"Argument %s must be an alphabetic, binary, or hexadecimal string." % str(self))
321
322
def coincidence_index(self, prec=0):
323
"""
324
Returns the probability of two randomly chosen characters being equal.
325
"""
326
if prec == 0:
327
RR = RealField()
328
else:
329
RR = RealField(prec)
330
char_dict = {}
331
for i in self._element_list:
332
# if char_dict.has_key(i):
333
# The method .has_key() has been deprecated since Python 2.2. Use
334
# "k in Dict" instead of "Dict.has_key(k)".
335
if i in char_dict:
336
char_dict[i] += 1
337
else:
338
char_dict[i] = 1
339
nn = 0
340
ci_num = 0
341
for i in char_dict.keys():
342
ni = char_dict[i]
343
nn += ni
344
ci_num += ni*(ni-1)
345
ci_den = nn*(nn-1)
346
return RR(ci_num)/ci_den
347
348
def character_count(self):
349
r"""
350
Return the count of each unique character.
351
352
EXAMPLES:
353
354
Count the character frequency in an object comprised of capital
355
letters of the English alphabet::
356
357
sage: M = AlphabeticStrings().encoding("abcabf")
358
sage: sorted(M.character_count().items())
359
[(A, 2), (B, 2), (C, 1), (F, 1)]
360
361
In an object comprised of binary numbers::
362
363
sage: M = BinaryStrings().encoding("abcabf")
364
sage: sorted(M.character_count().items())
365
[(0, 28), (1, 20)]
366
367
In an object comprised of octal numbers::
368
369
sage: A = OctalStrings()
370
sage: M = A([1, 2, 3, 2, 5, 3])
371
sage: sorted(M.character_count().items())
372
[(1, 1), (2, 2), (3, 2), (5, 1)]
373
374
In an object comprised of hexadecimal numbers::
375
376
sage: A = HexadecimalStrings()
377
sage: M = A([1, 2, 4, 6, 2, 4, 15])
378
sage: sorted(M.character_count().items())
379
[(1, 1), (2, 2), (4, 2), (6, 1), (f, 1)]
380
381
In an object comprised of radix-64 characters::
382
383
sage: A = Radix64Strings()
384
sage: M = A([1, 2, 63, 45, 45, 10]); M
385
BC/ttK
386
sage: sorted(M.character_count().items())
387
[(B, 1), (C, 1), (K, 1), (t, 2), (/, 1)]
388
389
TESTS:
390
391
Empty strings return no counts of character frequency::
392
393
sage: M = AlphabeticStrings().encoding("")
394
sage: M.character_count()
395
{}
396
sage: M = BinaryStrings().encoding("")
397
sage: M.character_count()
398
{}
399
sage: A = OctalStrings()
400
sage: M = A([])
401
sage: M.character_count()
402
{}
403
sage: A = HexadecimalStrings()
404
sage: M = A([])
405
sage: M.character_count()
406
{}
407
sage: A = Radix64Strings()
408
sage: M = A([])
409
sage: M.character_count()
410
{}
411
"""
412
# the character frequency, i.e. the character count
413
CF = {}
414
for e in self:
415
if e in CF:
416
CF[e] += 1
417
else:
418
CF.setdefault(e, 1)
419
return CF
420
421
def frequency_distribution(self, length=1, prec=0):
422
"""
423
Returns the probability space of character frequencies. The output
424
of this method is different from that of the method
425
:func:`characteristic_frequency()
426
<sage.monoids.string_monoid.AlphabeticStringMonoid.characteristic_frequency>`.
427
One can think of the characteristic frequency probability of an
428
element in an alphabet `A` as the expected probability of that element
429
occurring. Let `S` be a string encoded using elements of `A`. The
430
frequency probability distribution corresponding to `S` provides us
431
with the frequency probability of each element of `A` as observed
432
occurring in `S`. Thus one distribution provides expected
433
probabilities, while the other provides observed probabilities.
434
435
INPUT:
436
437
- ``length`` -- (default ``1``) if ``length=1`` then consider the
438
probability space of monogram frequency, i.e. probability
439
distribution of single characters. If ``length=2`` then consider
440
the probability space of digram frequency, i.e. probability
441
distribution of pairs of characters. This method currently
442
supports the generation of probability spaces for monogram
443
frequency (``length=1``) and digram frequency (``length=2``).
444
445
- ``prec`` -- (default ``0``) a non-negative integer representing
446
the precision (in number of bits) of a floating-point number. The
447
default value ``prec=0`` means that we use 53 bits to represent
448
the mantissa of a floating-point number. For more information on
449
the precision of floating-point numbers, see the function
450
:func:`RealField() <sage.rings.real_mpfr.RealField>` or refer to the module
451
:mod:`real_mpfr <sage.rings.real_mpfr>`.
452
453
EXAMPLES:
454
455
Capital letters of the English alphabet::
456
457
sage: M = AlphabeticStrings().encoding("abcd")
458
sage: L = M.frequency_distribution().function()
459
sage: sorted(L.items())
460
<BLANKLINE>
461
[(A, 0.250000000000000),
462
(B, 0.250000000000000),
463
(C, 0.250000000000000),
464
(D, 0.250000000000000)]
465
466
The binary number system::
467
468
sage: M = BinaryStrings().encoding("abcd")
469
sage: L = M.frequency_distribution().function()
470
sage: sorted(L.items())
471
[(0, 0.593750000000000), (1, 0.406250000000000)]
472
473
The hexadecimal number system::
474
475
sage: M = HexadecimalStrings().encoding("abcd")
476
sage: L = M.frequency_distribution().function()
477
sage: sorted(L.items())
478
<BLANKLINE>
479
[(1, 0.125000000000000),
480
(2, 0.125000000000000),
481
(3, 0.125000000000000),
482
(4, 0.125000000000000),
483
(6, 0.500000000000000)]
484
485
Get the observed frequency probability distribution of digrams in the
486
string "ABCD". This string consists of the following digrams: "AB",
487
"BC", and "CD". Now find out the frequency probability of each of
488
these digrams as they occur in the string "ABCD"::
489
490
sage: M = AlphabeticStrings().encoding("abcd")
491
sage: D = M.frequency_distribution(length=2).function()
492
sage: sorted(D.items())
493
[(AB, 0.333333333333333), (BC, 0.333333333333333), (CD, 0.333333333333333)]
494
"""
495
if not length in (1, 2):
496
raise NotImplementedError("Not implemented")
497
if prec == 0:
498
RR = RealField()
499
else:
500
RR = RealField(prec)
501
S = self.parent()
502
n = S.ngens()
503
if length == 1:
504
Alph = S.gens()
505
else:
506
Alph = tuple([ x*y for x in S.gens() for y in S.gens() ])
507
X = {}
508
N = len(self)-length+1
509
eps = RR(Integer(1)/N)
510
for i in range(N):
511
c = self[i:i+length]
512
# if X.has_key(c):
513
# The method .has_key() has been deprecated since Python 2.2. Use
514
# "k in Dict" instead of "Dict.has_key(k)".
515
if c in X:
516
X[c] += eps
517
else:
518
X[c] = eps
519
# Return a dictionary of probability distribution. This should
520
# allow for easier parsing of the dictionary.
521
return DiscreteProbabilitySpace(Alph, X, RR)
522
523