Path: blob/master/src/SentinelUtilities/SentinelAnomalyLookup/anomaly_finder.py
3255 views
# -------------------------------------------------------------------------1# Copyright (c) Microsoft Corporation. All rights reserved.2# Licensed under the MIT License. See License.txt in the project root for3# license information.4# --------------------------------------------------------------------------5"""6Anomaly Finder module:7This module has two classes: AnomalyQueries and AnomalyFinder8"""910import copy11import datetime as dt12import pandas as pd13from pandas.io.json import json_normalize14from azure.loganalytics.models import QueryBody1516from SentinelUtils.obfuscation_utility import ObfuscationUtility17from SentinelLog.log import Log18from .anomaly_lookup_view_helper import AnomalyLookupViewHelper192021class AnomalyQueries(): # pylint: disable=too-few-public-methods22""" KQLs for anomaly lookup """2324KEY = b'7jH91qslpgxNSdVPiQwtr2D4bSg2i5ArfJqAMA1zCkU='25QUERIES = {}26# pylint: disable=line-too-long27QUERIES['LISTTABLES'] = b'gAAAAABfdL95MQF9HV5YIbGGKy9kWPKXpbH3VWf80l9elru76D6sw8JN-shOQSYcWfSW43PfW7LSoShP14zl8_iu1-RC-mFYX_ZxiUBTZ5H-MHZZIAU0-nL1QOJ3EgJqHWpU79Kf9VsxjuKDf6oedEDnveb5MaM2Li-qQeJ7edUb1JknuQD5JFKRpqr_yE1jBQO3RPNxyXg9rhFPkNsTTxyHXZLE3NWQMA=='28# pylint: disable=line-too-long29QUERIES['ISCATCOLUMN'] = b'gAAAAABdNjkBT7XTAvPjCp56dI7LRL35EPMzF8UX_c4Hs0q910bTI2y-6viJGG7ZlXjclORrXFK68dAd2dYfLB0A_rw0Q9CeLEGUuWkHSFc7al2xwu7uEgliddsQQyocWhqyWlEtFEReIeJaqZYaSOkLon7sPN_icKEwiHc096kCkyjw5D0TeZ0Kgmnc5799Al7ND0kAk7KO'30# pylint: disable=line-too-long31QUERIES['ISCATHEURISTIC'] = b'gAAAAABdNjkglsyzKMCUkIXq3aqcim0F70S86HfAqaiyNUIF0La2st1DkiQTFK_vqVIyxiY25i78FiT6y0yZG4YQmpAVVwRJ302KkeAPVq0mPPK1FKbRcnnqmIc1HOAycyv3dmDHDUG7-_b-wyy8IDneWYyTE3TxyLUcG3kJRTmQd_6-hLXfDUjctjm0VPrA9zcrN8Il1y-nq-4jOsFZbO0qvHZfnLBTwaf52hkXPqmkZN9Rz-deW6Q4VY_j7Vw7rRrXM5WxRchL2kJBhGOq-hM8A3W9AA7qHnkgVu2BOVEYZAT_EnHvks8YMcWP04WKs49Dw5Ga4UMqJeU4MJH8PKfNmn7zcuLwMA=='32# pylint: disable=line-too-long33QUERIES['TIMESERIESANOMALYDETECTION'] = b'gAAAAABeNN8ojnOJUuvi1_FI8bqz200wGX1CPFnxU8FHDnwmFx7Ywm54WwbFXMWXVIpiLf9zjv5Fcl85wdyBdA6KS7_xph89Geb5CQOoqMGt_-syZ_KgE4CXQoCDCfWmCYXlx2zzIZX5g88SfVJeLmWbCUsk20KTO0Ecdt6TauIUuLkCh85v55_j8RSxXvwZy4y-WAate9hZh1xoRV5fjvq1_ox2V_qpJO-HpzdCuZGelMx3DkxL0PUy2_SXLZdaa5HLl0vM0IfmiDqIQgcxnjAEwk6GGezh3FvT43BCwO_HX5PHixCkqozeXzbsLuWxOMpoZ5I1174dVWK-8e9uiVvMfwyOqeC9tlKoAUXAEBsXS41kEbAMTwlK3VcCqq9iTfnW0jUkSlMV7P_JbWOVxErwC4DtIKP5d6AyNcqVhBlOYZwouqoY734vBhDsx95bzN2mwwL2-_Set_ksdoxTLlFVaWK9MY4tmBCsoQSVHrVuo00K1zPYIh43VgbVm6gFLs7kdjFDraMb5fjx0VxXQrkUkeO3xZ7i9caKl2ODpoVf2ahAPxqPTmHIG9bqQBb2SGu3PakxrRiRanOLY-pJ4eN54fQlYhk-O-1UwpUnna4WVMbqST4tFbEBCpr4IeGNqnaEeNccuHaWHDdjxIEt5sZ402LFJAV0HvMOxOVD3qUjUroHFysJrZXzYJF9HApBbtryokMjgh1YPGAwguniTTyUQolrV_1m54yhJLd7I-39MnBpfobO6sXAtZjPxZfoCCmiJta_JaAuj2sW-8goxU0NBWUxBsiEnPIrHyJKLiduavgID2UcHR4rhByyUg3EzbeKFO-WS-oIVXn0jFUfGjhOKc70Xc1R3L_vvGe2029AYkWmHb36yEJgVlVJ4YQ_7eMSjosZA6R97SDzYXCC1eOopH1hyexwvJwmlzP2gb3NcZ41WbmCTA7vYlT1uH4IqMiuvNWwkZ7MpPIccOnwFWkLFzh_BpgDfvnGy-loqaTNuuLUYzoAyhejzfBQPwG7AlPwH85pfNGarrz7z47uMYQE51-R2gxpDSmw3QvWacKIs3F2g94umQjXJkZ9otiNxPqZceKINig3pFj_SMRiQ1vXRNyKUM8BYptJJ7CFR5TI37lElejgoQBD3VQ7uyK_Ghz8M4cJG1P3ry9d8mk0-vwNClGkb4WGgLR69dPBSZM03uQ14oolEHxuwPQgBoLM-Rlu3YuGQBfbCtqtarLa8IwFBQcbA6WsKJ2dDnCcrRdCuMMTsKuJlSfoU-7jAOwVN-ISg8m47aWbJZtjAODzPxuv9KOQlTfApJcjla37UKKctce9kVkYeoqi6dJnSF9HLsfbRhSIf9bWKHue2ML20urH-0xhIDxvSA=='34# pylint: disable=line-too-long35QUERIES['TIMEWINDOWQUERY'] = b'gAAAAABdOjxI1Cq7frn_5Gj1l2vvA6Eu-a5qghqvRTBc8I9gWdcI8JiALXjpT7qJwf8ZBCKCrwYtMXY2-bp7Cj4jwYVXVDKmXRjoyz0xLbiVdCkIc07U2sNjpwzO1y1OvRr2apYv5Y9_yh_vOpqN4uv1WUezH_z1bXNCO-yI-LMIlidav4Xh5KwRtGBTnXGBk5YidPJVHfnZZGpCQ5w7g4t0ptoM5p6w_eXC8RZ82J3QLIVGtguWISFYweE5GWVJkkUXq3aq3n36uIFl2T3YllLUX2FytfOw_B8Xt1UrspWURfgDx1xqyCnqUEPG_EnO-TuGKFbMkh6AjpcduidHTuuS45YGatPvzRzyAzElLnbj7-s0gc-0POrUyiNaeTj_Tg0wTBsIJklL'36# pylint: disable=line-too-long37QUERIES['ISENTITYINTABLE'] = b'gAAAAABdNkO8YYV6ElbBqI9qp0oLHLquoYJD_7umEu1sDgyHouYcN0jU6vlOPp8AN5lecaMvXPUqQ5ZiFw6393Z9l7kNOB7IMITURv59MZJxeEVpt5ud9F4ge-5JGge5k7ux2YU50z-u9djJYet2SO-n1MpD5xO14ODKtBPsr9guZ40wYJwMzwLCjDSpTXFnIDjYrXDhfU3D2YGc4jnrq2EePBUAPPKxnIXg7AtmnGm4Add1_aV-pDlHMXTn09Z3kvlUcHpHBw7g'3839@staticmethod40def get_query(name):41""" get KQL """4243en_query = AnomalyQueries.QUERIES[name]44obfuscate = ObfuscationUtility(AnomalyQueries.KEY)45query = obfuscate.deobfuscate_text(en_query)46return query474849class AnomalyFinder():50"""51This class provides process flow functions for anomaly lookup.52Method - run is the main entry point.53"""5455def __init__(self, workspace_id, la_data_client):56self.workspace_id = workspace_id57self.la_data_client = la_data_client58self.logger = Log()59self.anomaly = ''6061def query_table_list(self):62""" Get a list of data tables from Log Analytics for the user """6364query = AnomalyQueries.get_query('LISTTABLES')65return self.query_loganalytics(query)6667def query_loganalytics(self, query):68""" This method will call Log Analytics through LA client """6970res = self.la_data_client.query(self.workspace_id, QueryBody(query=query))71json = res.as_dict()72cols = json_normalize(json['tables'][0], 'columns')73data_frame = json_normalize(json['tables'][0], 'rows')74if data_frame.shape[0] != 0:75data_frame.columns = cols.name76return data_frame7778@staticmethod79def construct_related_queries(df_anomalies):80""" This method constructs query for user to repo and can be saves for future references """8182if df_anomalies.shape[0] == 0:83return None8485queries = ''86for tbl in df_anomalies.Table.unique():8788cur_table_anomalies = df_anomalies.loc[df_anomalies.Table == tbl, :]89query = """{tbl} \90| where TimeGenerated > datetime({maxTimestamp})-14d and TimeGenerated < datetime({maxTimestamp}) \91| where {entCol} has "{qEntity}" \92| where """.format(**{93'tbl': tbl,94'qTimestamp': cur_table_anomalies.qTimestamp.iloc[0],95'maxTimestamp': cur_table_anomalies.maxTimestamp.iloc[0],96'entCol': cur_table_anomalies.entCol.iloc[0],97'qEntity': cur_table_anomalies.qEntity.iloc[0]98})99100for j, row in cur_table_anomalies.iterrows(): # pylint: disable=unused-variable101query += " {col} == to{colType}(\"{colVal}\") or".format(102col=row.colName,103colType=(row.colType) if 'colType' in row.keys() else 'string',104colVal=row.colVal.replace('"', '')105)106107query = query[:-2] # drop the last or108query += " | take 1000; " # limit the output size109query = query.replace("\\", "\\\\")110111queries += query112return queries113114# pylint: disable=too-many-locals115def get_timewindow(self, q_entity, q_timestamp, ent_col, tbl):116""" find the relevant time window for analysis """117118win_start = 0119min_timestamp = None120delta = None121max_timestamp = None122long_min_timestamp = None123time_window_query_template = AnomalyQueries.get_query('TIMEWINDOWQUERY')124125for from_hour in range(-30, 0, 1):126kql_time_range_d = time_window_query_template.format(127table=tbl,128qDate=q_timestamp,129entColumn=ent_col,130qEntity=q_entity,131f=from_hour,132t=from_hour+1,133delta='d')134135df_time_range = self.query_loganalytics(kql_time_range_d)136137if df_time_range.shape[0] > 0:138win_start = from_hour139break140141dt_q_timestamp = pd.to_datetime(q_timestamp)142ind2now = dt.datetime.utcnow() - dt_q_timestamp143if win_start < -3:144if ind2now > dt.timedelta(days=1):145delta = '1d'146max_timestamp = dt_q_timestamp + dt.timedelta(days=1)147else:148delta = '1d'149max_timestamp = dt.datetime.now()150long_min_timestamp = max_timestamp + dt.timedelta(days=win_start)151min_timestamp = max_timestamp + dt.timedelta(days=max([-6, win_start]))152153elif win_start < 0: # switch to hours154win_start_hour = -5155for from_hour in range(-3*24, -5, 1):156kql_time_range_h = time_window_query_template.format(157table=tbl,158qDate=q_timestamp,159entColumn=ent_col,160qEntity=q_entity,161f=from_hour,162t=from_hour+1,163delta='h')164165df_time_range = self.query_loganalytics(kql_time_range_h)166167if df_time_range.shape[0] > 0:168win_start_hour = from_hour169break170if win_start_hour < -5:171if ind2now > dt.timedelta(hours=1):172delta = '1h'173max_timestamp = dt_q_timestamp + dt.timedelta(hours=1)174else:175delta = '1h'176max_timestamp = dt.datetime.now()177min_timestamp = max_timestamp + dt.timedelta(hours=win_start_hour)178long_min_timestamp = min_timestamp179180return min_timestamp, delta, max_timestamp, long_min_timestamp181182# pylint: disable=too-many-locals183def run(self, q_timestamp, q_entity, tables):184""" Main function for Anomaly Lookup """185186progress_bar = AnomalyLookupViewHelper.define_int_progress_bar()187display(progress_bar) # pylint: disable=undefined-variable188189# list tables if not given190if not tables:191kql_list_tables = AnomalyQueries.get_query('LISTTABLES')192tables = self.query_loganalytics(kql_list_tables)193tables = tables.SentinelTableName.tolist()194195progress_bar.value += 1196197# find the column in which the query entity appears in each table198# - assumption that it appears in just one columns199tables2search = []200is_entity_in_table_template = AnomalyQueries.get_query('ISENTITYINTABLE')201202for tbl in tables:203kql_entity_in_table = is_entity_in_table_template.format(204table=tbl,205qDate=q_timestamp,206qEntity=q_entity)207ent_in_table = self.query_loganalytics(kql_entity_in_table)208209if ent_in_table.shape[0] > 0:210ent_col = [col for col in ent_in_table.select_dtypes('object').columns[1:] if211ent_in_table.loc[0, col] is not None212and ent_in_table.loc[:, col].str.contains(q_entity, case=False).all()]213if ent_col:214ent_col = ent_col[0]215tables2search.append({'table': tbl, 'entCol': ent_col})216217progress_bar.value += 2218219# for each table, find the time window to query on220for tbl in tables2search:221tbl['minTimestamp'], tbl['delta'], tbl['maxTimestamp'], tbl['longMinTimestamp'] = \222self.get_timewindow(q_entity, q_timestamp, tbl['entCol'], tbl['table'])223224progress_bar.value += 1225226# identify all the categorical columns per table on which we will find anomalies227categorical_cols = []228is_cat_column_template = AnomalyQueries.get_query('ISCATCOLUMN')229is_cat_heuristic_template = AnomalyQueries.get_query('ISCATHEURISTIC')230for tbl in tables2search:231kql_is_cat_column = is_cat_column_template.format(table=tbl['table'])232df_cols = self.query_loganalytics(kql_is_cat_column)233234for col in df_cols.ColumnName:235kql_is_cat_heuristic = is_cat_heuristic_template.format(236table=tbl['table'],237column=col)238df_is_cat = self.query_loganalytics(kql_is_cat_heuristic)239240if df_is_cat.shape[0] > 0:241cat_col_info = copy.deepcopy(tbl)242cat_col_info['col'] = col243categorical_cols.append(cat_col_info)244245progress_bar.value += 2246247anomalies_list = []248time_series_anomaly_detection_template = \249AnomalyQueries.get_query('TIMESERIESANOMALYDETECTION')250for col_info in categorical_cols:251max_timestamp = col_info['maxTimestamp'].strftime('%Y-%m-%dT%H:%M:%S.%f')252long_min_timestamp = col_info['longMinTimestamp'].strftime('%Y-%m-%dT%H:%M:%S.%f')253254kql_time_series_anomaly_detection = time_series_anomaly_detection_template.format(255table=col_info['table'],256column=col_info['col'],257entColumn=col_info['entCol'],258qEntity=q_entity,259minTimestamp=long_min_timestamp,260maxTimestamp=max_timestamp,261qTimestamp=q_timestamp,262delta=col_info['delta'])263264cur_anomalies = self.query_loganalytics(kql_time_series_anomaly_detection)265266anomalies_list.append(cur_anomalies)267268progress_bar.value += 2269270if anomalies_list:271anomalies = pd.concat(anomalies_list, axis=0)272else:273anomalies = pd.DataFrame()274275progress_bar.value += 2276queries = AnomalyFinder.construct_related_queries(anomalies)277progress_bar.close()278self.anomaly = str(anomalies.to_json(orient='records'))279280return anomalies, queries281# End of the Module #282283284