Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Azure
GitHub Repository: Azure/Azure-Sentinel-Notebooks
Path: blob/master/src/SentinelUtilities/SentinelAnomalyLookup/anomaly_finder.py
3255 views
1
# -------------------------------------------------------------------------
2
# Copyright (c) Microsoft Corporation. All rights reserved.
3
# Licensed under the MIT License. See License.txt in the project root for
4
# license information.
5
# --------------------------------------------------------------------------
6
"""
7
Anomaly Finder module:
8
This module has two classes: AnomalyQueries and AnomalyFinder
9
"""
10
11
import copy
12
import datetime as dt
13
import pandas as pd
14
from pandas.io.json import json_normalize
15
from azure.loganalytics.models import QueryBody
16
17
from SentinelUtils.obfuscation_utility import ObfuscationUtility
18
from SentinelLog.log import Log
19
from .anomaly_lookup_view_helper import AnomalyLookupViewHelper
20
21
22
class AnomalyQueries():  # pylint: disable=too-few-public-methods
    """Registry of obfuscated KQL templates used by the anomaly lookup flow."""

    # Symmetric key used to deobfuscate the encrypted query templates below.
    KEY = b'7jH91qslpgxNSdVPiQwtr2D4bSg2i5ArfJqAMA1zCkU='

    # Encrypted KQL templates, keyed by logical query name.
    # pylint: disable=line-too-long
    QUERIES = {
        'LISTTABLES': b'gAAAAABfdL95MQF9HV5YIbGGKy9kWPKXpbH3VWf80l9elru76D6sw8JN-shOQSYcWfSW43PfW7LSoShP14zl8_iu1-RC-mFYX_ZxiUBTZ5H-MHZZIAU0-nL1QOJ3EgJqHWpU79Kf9VsxjuKDf6oedEDnveb5MaM2Li-qQeJ7edUb1JknuQD5JFKRpqr_yE1jBQO3RPNxyXg9rhFPkNsTTxyHXZLE3NWQMA==',
        'ISCATCOLUMN': b'gAAAAABdNjkBT7XTAvPjCp56dI7LRL35EPMzF8UX_c4Hs0q910bTI2y-6viJGG7ZlXjclORrXFK68dAd2dYfLB0A_rw0Q9CeLEGUuWkHSFc7al2xwu7uEgliddsQQyocWhqyWlEtFEReIeJaqZYaSOkLon7sPN_icKEwiHc096kCkyjw5D0TeZ0Kgmnc5799Al7ND0kAk7KO',
        'ISCATHEURISTIC': b'gAAAAABdNjkglsyzKMCUkIXq3aqcim0F70S86HfAqaiyNUIF0La2st1DkiQTFK_vqVIyxiY25i78FiT6y0yZG4YQmpAVVwRJ302KkeAPVq0mPPK1FKbRcnnqmIc1HOAycyv3dmDHDUG7-_b-wyy8IDneWYyTE3TxyLUcG3kJRTmQd_6-hLXfDUjctjm0VPrA9zcrN8Il1y-nq-4jOsFZbO0qvHZfnLBTwaf52hkXPqmkZN9Rz-deW6Q4VY_j7Vw7rRrXM5WxRchL2kJBhGOq-hM8A3W9AA7qHnkgVu2BOVEYZAT_EnHvks8YMcWP04WKs49Dw5Ga4UMqJeU4MJH8PKfNmn7zcuLwMA==',
        'TIMESERIESANOMALYDETECTION': b'gAAAAABeNN8ojnOJUuvi1_FI8bqz200wGX1CPFnxU8FHDnwmFx7Ywm54WwbFXMWXVIpiLf9zjv5Fcl85wdyBdA6KS7_xph89Geb5CQOoqMGt_-syZ_KgE4CXQoCDCfWmCYXlx2zzIZX5g88SfVJeLmWbCUsk20KTO0Ecdt6TauIUuLkCh85v55_j8RSxXvwZy4y-WAate9hZh1xoRV5fjvq1_ox2V_qpJO-HpzdCuZGelMx3DkxL0PUy2_SXLZdaa5HLl0vM0IfmiDqIQgcxnjAEwk6GGezh3FvT43BCwO_HX5PHixCkqozeXzbsLuWxOMpoZ5I1174dVWK-8e9uiVvMfwyOqeC9tlKoAUXAEBsXS41kEbAMTwlK3VcCqq9iTfnW0jUkSlMV7P_JbWOVxErwC4DtIKP5d6AyNcqVhBlOYZwouqoY734vBhDsx95bzN2mwwL2-_Set_ksdoxTLlFVaWK9MY4tmBCsoQSVHrVuo00K1zPYIh43VgbVm6gFLs7kdjFDraMb5fjx0VxXQrkUkeO3xZ7i9caKl2ODpoVf2ahAPxqPTmHIG9bqQBb2SGu3PakxrRiRanOLY-pJ4eN54fQlYhk-O-1UwpUnna4WVMbqST4tFbEBCpr4IeGNqnaEeNccuHaWHDdjxIEt5sZ402LFJAV0HvMOxOVD3qUjUroHFysJrZXzYJF9HApBbtryokMjgh1YPGAwguniTTyUQolrV_1m54yhJLd7I-39MnBpfobO6sXAtZjPxZfoCCmiJta_JaAuj2sW-8goxU0NBWUxBsiEnPIrHyJKLiduavgID2UcHR4rhByyUg3EzbeKFO-WS-oIVXn0jFUfGjhOKc70Xc1R3L_vvGe2029AYkWmHb36yEJgVlVJ4YQ_7eMSjosZA6R97SDzYXCC1eOopH1hyexwvJwmlzP2gb3NcZ41WbmCTA7vYlT1uH4IqMiuvNWwkZ7MpPIccOnwFWkLFzh_BpgDfvnGy-loqaTNuuLUYzoAyhejzfBQPwG7AlPwH85pfNGarrz7z47uMYQE51-R2gxpDSmw3QvWacKIs3F2g94umQjXJkZ9otiNxPqZceKINig3pFj_SMRiQ1vXRNyKUM8BYptJJ7CFR5TI37lElejgoQBD3VQ7uyK_Ghz8M4cJG1P3ry9d8mk0-vwNClGkb4WGgLR69dPBSZM03uQ14oolEHxuwPQgBoLM-Rlu3YuGQBfbCtqtarLa8IwFBQcbA6WsKJ2dDnCcrRdCuMMTsKuJlSfoU-7jAOwVN-ISg8m47aWbJZtjAODzPxuv9KOQlTfApJcjla37UKKctce9kVkYeoqi6dJnSF9HLsfbRhSIf9bWKHue2ML20urH-0xhIDxvSA==',
        'TIMEWINDOWQUERY': b'gAAAAABdOjxI1Cq7frn_5Gj1l2vvA6Eu-a5qghqvRTBc8I9gWdcI8JiALXjpT7qJwf8ZBCKCrwYtMXY2-bp7Cj4jwYVXVDKmXRjoyz0xLbiVdCkIc07U2sNjpwzO1y1OvRr2apYv5Y9_yh_vOpqN4uv1WUezH_z1bXNCO-yI-LMIlidav4Xh5KwRtGBTnXGBk5YidPJVHfnZZGpCQ5w7g4t0ptoM5p6w_eXC8RZ82J3QLIVGtguWISFYweE5GWVJkkUXq3aq3n36uIFl2T3YllLUX2FytfOw_B8Xt1UrspWURfgDx1xqyCnqUEPG_EnO-TuGKFbMkh6AjpcduidHTuuS45YGatPvzRzyAzElLnbj7-s0gc-0POrUyiNaeTj_Tg0wTBsIJklL',
        'ISENTITYINTABLE': b'gAAAAABdNkO8YYV6ElbBqI9qp0oLHLquoYJD_7umEu1sDgyHouYcN0jU6vlOPp8AN5lecaMvXPUqQ5ZiFw6393Z9l7kNOB7IMITURv59MZJxeEVpt5ud9F4ge-5JGge5k7ux2YU50z-u9djJYet2SO-n1MpD5xO14ODKtBPsr9guZ40wYJwMzwLCjDSpTXFnIDjYrXDhfU3D2YGc4jnrq2EePBUAPPKxnIXg7AtmnGm4Add1_aV-pDlHMXTn09Z3kvlUcHpHBw7g',
    }

    @staticmethod
    def get_query(name):
        """Deobfuscate and return the plaintext KQL template for ``name``.

        Raises KeyError if ``name`` is not a registered query.
        """
        deobfuscator = ObfuscationUtility(AnomalyQueries.KEY)
        return deobfuscator.deobfuscate_text(AnomalyQueries.QUERIES[name])
48
49
50
class AnomalyFinder():
    """
    This class provides process flow functions for anomaly lookup.
    Method - run is the main entry point.
    """

    def __init__(self, workspace_id, la_data_client):
        # Target Log Analytics workspace and the client used to query it.
        self.workspace_id = workspace_id
        self.la_data_client = la_data_client
        self.logger = Log()
        # JSON string of the most recent anomaly results; populated by run().
        self.anomaly = ''

    def query_table_list(self):
        """ Get a list of data tables from Log Analytics for the user """

        query = AnomalyQueries.get_query('LISTTABLES')
        return self.query_loganalytics(query)

    def query_loganalytics(self, query):
        """ This method will call Log Analytics through LA client """

        # Execute the KQL query and convert the first result table to a
        # pandas DataFrame (column metadata and rows arrive separately).
        res = self.la_data_client.query(self.workspace_id, QueryBody(query=query))
        json = res.as_dict()
        cols = json_normalize(json['tables'][0], 'columns')
        data_frame = json_normalize(json['tables'][0], 'rows')
        if data_frame.shape[0] != 0:
            # Only attach column names when rows exist; an empty frame from
            # json_normalize has no columns to label.
            data_frame.columns = cols.name
        return data_frame

    @staticmethod
    def construct_related_queries(df_anomalies):
        """ This method constructs a query for the user to reproduce the lookup and it can be saved for future reference """

        if df_anomalies.shape[0] == 0:
            return None

        queries = ''
        # Build one KQL statement per table that produced anomalies.
        for tbl in df_anomalies.Table.unique():

            cur_table_anomalies = df_anomalies.loc[df_anomalies.Table == tbl, :]
            # 14-day window ending at the table's max timestamp, filtered to
            # rows mentioning the queried entity.
            query = """{tbl} \
            | where TimeGenerated > datetime({maxTimestamp})-14d and TimeGenerated < datetime({maxTimestamp}) \
            | where {entCol} has "{qEntity}" \
            | where """.format(**{
                'tbl': tbl,
                'qTimestamp': cur_table_anomalies.qTimestamp.iloc[0],
                'maxTimestamp': cur_table_anomalies.maxTimestamp.iloc[0],
                'entCol': cur_table_anomalies.entCol.iloc[0],
                'qEntity': cur_table_anomalies.qEntity.iloc[0]
            })

            # OR together one equality predicate per anomalous column value.
            for j, row in cur_table_anomalies.iterrows():  # pylint: disable=unused-variable
                query += " {col} == to{colType}(\"{colVal}\") or".format(
                    col=row.colName,
                    # Fall back to string when the anomaly row carries no type.
                    colType=(row.colType) if 'colType' in row.keys() else 'string',
                    colVal=row.colVal.replace('"', '')
                )

            query = query[:-2]  # drop the last or
            query += " | take 1000; "  # limit the output size
            # Escape backslashes so the query survives later string embedding.
            query = query.replace("\\", "\\\\")

            queries += query
        return queries

    # pylint: disable=too-many-locals
    def get_timewindow(self, q_entity, q_timestamp, ent_col, tbl):
        """ find the relevant time window for analysis """

        # win_start stays 0 (and all return values stay None) when no data is
        # found for the entity in the 30-day scan below.
        win_start = 0
        min_timestamp = None
        delta = None
        max_timestamp = None
        long_min_timestamp = None
        time_window_query_template = AnomalyQueries.get_query('TIMEWINDOWQUERY')

        # Scan backwards day-by-day (30 days before q_timestamp) for the
        # earliest 1-day bucket that contains data for the entity.
        for from_hour in range(-30, 0, 1):
            kql_time_range_d = time_window_query_template.format(
                table=tbl,
                qDate=q_timestamp,
                entColumn=ent_col,
                qEntity=q_entity,
                f=from_hour,
                t=from_hour+1,
                delta='d')

            df_time_range = self.query_loganalytics(kql_time_range_d)

            if df_time_range.shape[0] > 0:
                win_start = from_hour
                break

        # NOTE(review): ind2now uses utcnow() while max_timestamp below uses
        # now(); if the host is not in UTC these mix naive local and UTC
        # times — confirm intended.
        dt_q_timestamp = pd.to_datetime(q_timestamp)
        ind2now = dt.datetime.utcnow() - dt_q_timestamp
        if win_start < -3:
            # Enough history: analyze at day granularity.
            if ind2now > dt.timedelta(days=1):
                delta = '1d'
                max_timestamp = dt_q_timestamp + dt.timedelta(days=1)
            else:
                delta = '1d'
                max_timestamp = dt.datetime.now()
            long_min_timestamp = max_timestamp + dt.timedelta(days=win_start)
            # Short window is capped at 6 days back.
            min_timestamp = max_timestamp + dt.timedelta(days=max([-6, win_start]))

        elif win_start < 0:  # switch to hours
            win_start_hour = -5
            # Scan backwards hour-by-hour (up to 3 days) for the earliest
            # 1-hour bucket that contains data for the entity.
            for from_hour in range(-3*24, -5, 1):
                kql_time_range_h = time_window_query_template.format(
                    table=tbl,
                    qDate=q_timestamp,
                    entColumn=ent_col,
                    qEntity=q_entity,
                    f=from_hour,
                    t=from_hour+1,
                    delta='h')

                df_time_range = self.query_loganalytics(kql_time_range_h)

                if df_time_range.shape[0] > 0:
                    win_start_hour = from_hour
                    break
            if win_start_hour < -5:
                # Enough history: analyze at hour granularity.
                if ind2now > dt.timedelta(hours=1):
                    delta = '1h'
                    max_timestamp = dt_q_timestamp + dt.timedelta(hours=1)
                else:
                    delta = '1h'
                    max_timestamp = dt.datetime.now()
                min_timestamp = max_timestamp + dt.timedelta(hours=win_start_hour)
                long_min_timestamp = min_timestamp

        return min_timestamp, delta, max_timestamp, long_min_timestamp

    # pylint: disable=too-many-locals
    def run(self, q_timestamp, q_entity, tables):
        """ Main function for Anomaly Lookup """

        # Notebook-only progress widget; display() is injected by IPython.
        progress_bar = AnomalyLookupViewHelper.define_int_progress_bar()
        display(progress_bar)  # pylint: disable=undefined-variable

        # list tables if not given
        if not tables:
            kql_list_tables = AnomalyQueries.get_query('LISTTABLES')
            tables = self.query_loganalytics(kql_list_tables)
            tables = tables.SentinelTableName.tolist()

        progress_bar.value += 1

        # find the column in which the query entity appears in each table
        # - assumption that it appears in just one columns
        tables2search = []
        is_entity_in_table_template = AnomalyQueries.get_query('ISENTITYINTABLE')

        for tbl in tables:
            kql_entity_in_table = is_entity_in_table_template.format(
                table=tbl,
                qDate=q_timestamp,
                qEntity=q_entity)
            ent_in_table = self.query_loganalytics(kql_entity_in_table)

            if ent_in_table.shape[0] > 0:
                # Keep string-typed columns (skipping the first) whose every
                # value contains the entity, case-insensitively.
                ent_col = [col for col in ent_in_table.select_dtypes('object').columns[1:] if
                           ent_in_table.loc[0, col] is not None
                           and ent_in_table.loc[:, col].str.contains(q_entity, case=False).all()]
                if ent_col:
                    # Per the single-column assumption, take the first match.
                    ent_col = ent_col[0]
                    tables2search.append({'table': tbl, 'entCol': ent_col})

        progress_bar.value += 2

        # for each table, find the time window to query on
        for tbl in tables2search:
            tbl['minTimestamp'], tbl['delta'], tbl['maxTimestamp'], tbl['longMinTimestamp'] = \
                self.get_timewindow(q_entity, q_timestamp, tbl['entCol'], tbl['table'])

        progress_bar.value += 1

        # identify all the categorical columns per table on which we will find anomalies
        categorical_cols = []
        is_cat_column_template = AnomalyQueries.get_query('ISCATCOLUMN')
        is_cat_heuristic_template = AnomalyQueries.get_query('ISCATHEURISTIC')
        for tbl in tables2search:
            kql_is_cat_column = is_cat_column_template.format(table=tbl['table'])
            df_cols = self.query_loganalytics(kql_is_cat_column)

            for col in df_cols.ColumnName:
                # Second-stage heuristic decides whether the column is truly
                # categorical enough for anomaly detection.
                kql_is_cat_heuristic = is_cat_heuristic_template.format(
                    table=tbl['table'],
                    column=col)
                df_is_cat = self.query_loganalytics(kql_is_cat_heuristic)

                if df_is_cat.shape[0] > 0:
                    # Deep copy so each entry keeps its own table/time info.
                    cat_col_info = copy.deepcopy(tbl)
                    cat_col_info['col'] = col
                    categorical_cols.append(cat_col_info)

        progress_bar.value += 2

        anomalies_list = []
        time_series_anomaly_detection_template = \
            AnomalyQueries.get_query('TIMESERIESANOMALYDETECTION')
        for col_info in categorical_cols:
            # Format timestamps the way the KQL template expects (ISO-like).
            max_timestamp = col_info['maxTimestamp'].strftime('%Y-%m-%dT%H:%M:%S.%f')
            long_min_timestamp = col_info['longMinTimestamp'].strftime('%Y-%m-%dT%H:%M:%S.%f')

            kql_time_series_anomaly_detection = time_series_anomaly_detection_template.format(
                table=col_info['table'],
                column=col_info['col'],
                entColumn=col_info['entCol'],
                qEntity=q_entity,
                minTimestamp=long_min_timestamp,
                maxTimestamp=max_timestamp,
                qTimestamp=q_timestamp,
                delta=col_info['delta'])

            cur_anomalies = self.query_loganalytics(kql_time_series_anomaly_detection)

            anomalies_list.append(cur_anomalies)

        progress_bar.value += 2

        if anomalies_list:
            anomalies = pd.concat(anomalies_list, axis=0)
        else:
            anomalies = pd.DataFrame()

        progress_bar.value += 2
        queries = AnomalyFinder.construct_related_queries(anomalies)
        progress_bar.close()
        # Persist results as JSON for callers that read self.anomaly.
        self.anomaly = str(anomalies.to_json(orient='records'))

        return anomalies, queries
282
# End of the Module #
283
284