Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Azure
GitHub Repository: Azure/Azure-Sentinel-Notebooks
Path: blob/master/tutorials-and-examples/how-tos/Automation Setup - Configure Azure Machine Learning Pipelines.ipynb
3253 views
Kernel: Python 3.8 - AzureML

Automation Setup - Configure Azure Machine Learning Pipelines

Notebook Version: 1.0
Python Version: Python 3.8 - AzureML
Required Packages: No
Platforms Supported: Azure Machine Learning Notebooks

Data Source Required: No

Description

This is the second notebook in a series for setting up the Microsoft Sentinel notebook automation platform based on Azure Machine Learning Pipelines.
Before starting this notebook, you should have a notebook to be executed automatically ready.
This notebook provides step-by-step instructions to create Azure Machine Learning Pipeline, publish it, and schedule to run the pipeline to execute the targeted notebook.

*** Please run the cells sequentially to avoid errors. Please do not use "run all cells". ***

Table of Contents

  1. Warm-up

  2. Authentication to Azure Resources

  3. Azure Machine Learning Pipeline

1. Warm-up

# Azure Machine Learning and Pipeline SDK-specific imports # azureml import azureml.core from azureml.core import Workspace, Experiment from azureml.core.datastore import Datastore from azureml.core.runconfig import RunConfiguration from azureml.core.conda_dependencies import CondaDependencies from azureml.contrib.notebook import NotebookRunConfig, AzureMLNotebookHandler from azureml.pipeline.core import Pipeline from azureml.pipeline.core import PipelineData from azureml.contrib.notebook import NotebookRunnerStep from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule # azure common/core from azure.common.credentials import get_azure_cli_credentials from azure.mgmt.resource import ResourceManagementClient # Python/ipython import json from datetime import datetime from IPython.display import display, HTML, Markdown # Check core SDK version number print("SDK version:", azureml.core.VERSION)
# Functions will be used in this notebook def read_config_values(file_path): "This loads pre-generated parameters for Microsoft Sentinel Workspace" with open(file_path) as json_file: if json_file: json_config = json.load(json_file) return (json_config["tenant_id"], json_config["subscription_id"], json_config["resource_group"], json_config["workspace_id"], json_config["workspace_name"], json_config["user_alias"], json_config["user_object_id"]) return None def has_valid_token(): "Check to see if there is a valid AAD token" try: credentials, sub_id = get_azure_cli_credentials() creds = credentials._get_cred(resource=None) token = creds._token_retriever()[2] print("Successfully signed in.") return True except Exception as ex: if "Please run 'az login' to setup account" in str(ex): print("Please sign in first.") return False elif "AADSTS70043: The refresh token has expired" in str(ex): message = "**The refresh token has expired. <br> Please continue your login process. Then: <br> 1. If you plan to run multiple notebooks on the same compute instance today, you may restart the compute instance by clicking 'Compute' on left menu, then select the instance, clicking 'Restart'; <br> 2. Otherwise, you may just restart the kernel from top menu. <br> Finally, close and re-load the notebook, then re-run cells one by one from the top.**" display(Markdown(message)) return False elif "[Errno 2] No such file or directory: '/home/azureuser/.azure/azureProfile.json'" in str(ex): print("Please sign in.") return False else: print(str(ex)) return False except: print("Please restart the kernel, and run 'az login'.") return False
# Calling the above function to populate Microsoft Sentinel workspace parameters # The file, config.json, was generated by the system, however, you may modify the values, or manually set the variables tenant_id, subscription_id, resource_group, workspace_id, workspace_name, user_alias, user_object_id = read_config_values('config.json'); print("Subscription Id: " + subscription_id)

2. Authentication to Azure Resources

# Azure CLI is used to get device code to login into Azure, you need to copy the code and open the DeviceLogin site. # You may add [--tenant $tenant_id] to the command if has_valid_token() == False: !echo -e '\e[42m' !az login --tenant $tenant_id --use-device-code

3. Azure Machine Learning Pipeline

# 1. Enter resource names # Enter name of an Azure resource group resource_group = 'myresourcegroup' # Enter current AML workspace name current_aml_workspace_name = 'auto2022' # Enter compute cluster name amlcompute_cluster_name = 'compcl2022'
# 2. Get AML workspace ws = Workspace.get(name=current_aml_workspace_name, subscription_id=subscription_id, resource_group=resource_group) print(ws) ws.set_default_datastore("workspaceblobstore") datastore = Datastore.get(ws, "workspaceblobstore")
# 3. Create a new RunConfig object source_directory = '' notebook_name = 'Automation Gallery - Credential Scan on Azure Blob Storage.ipynb' output_notebook_name = 'blob_scan_results.ipynb' conda_run_config = RunConfiguration(framework="python") conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE print('conda-run config is ready') # Create notebook run configuration and set parameters values handler = AzureMLNotebookHandler(timeout=600, progress_bar=False, log_output=True) cfg = NotebookRunConfig(source_directory=source_directory, notebook=notebook_name, handler = handler, parameters={}, run_config=conda_run_config, output_notebook=output_notebook_name) print("Notebook Run Config is created.")
# 4. Define NotebookRunnerStep #my_pipeline_param = PipelineParameter(name="my_pipeline_param", default_value=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) output_name = "notebookresult_2022" output_from_notebook = PipelineData(name="notebook_processed_data", datastore=Datastore.get(ws, "workspaceblobstore"),output_overwrite=True, output_mode="upload") notebook_runner_step = NotebookRunnerStep(name="sentinel_notebook_step", notebook_run_config=cfg, params = {}, # params={"my_pipeline_param": my_pipeline_param}, inputs=[], outputs=[], allow_reuse=False, compute_target=amlcompute_cluster_name, output_notebook_pipeline_data_name=output_name) print("Notebook Runner Step is Created.")
# 5. Build Pipeline and publish it pipeline4sentinel = Pipeline(workspace=ws, steps=[notebook_runner_step]) print("Pipeline creation complete") # Publish the pipeline timenow = datetime.now().strftime('%Y-%m-%d-%H-%M') pipeline_name = "Sentinel-Pipeline-" + timenow published_sentinel_pipeline = pipeline4sentinel.publish( name=pipeline_name, description=pipeline_name) print("Newly published pipeline id: {}".format(published_sentinel_pipeline.id)) print("Endpoint: {}".format(published_sentinel_pipeline.endpoint))
# 6. Create a schedule for the published pipeline using a recurrence schedule_name = 'sentinel_schedule' experiment_name = 'sentinel_experiment_2022' recurrence = ScheduleRecurrence(frequency="Day", interval=1, hours=[22], minutes=[30]) # Runs every other day at 10:30pm #recurrence = ScheduleRecurrence(frequency="Hour", interval=8) # Runs every two hours schedule = Schedule.create(workspace=ws, name=schedule_name, pipeline_id=published_sentinel_pipeline.id, experiment_name=experiment_name, recurrence=recurrence, wait_for_provisioning=True, description="Schedule to run Sentinel notebook") print("Created schedule with id: {}".format(schedule.id))