# Local: Tokenizing and Embedding
---
* Collaborators:
    * Roberto Rodriguez (@Cyb3rWard0g)
* References:
    * https://huggingface.co/docs/transformers/main/tokenizer_summary
    * https://huggingface.co/docs/transformers/model_doc/big_bird

## Basic Example

### Tokenizing with BigBird Model Tokenizer

In [72]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the BigBird checkpoint
checkpoint = "google/bigbird-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Split a short example sentence into its token strings
sentence = "Hello, my name is Roberto"
tokens = tokenizer.tokenize(sentence)

In [73]:
# Display the token strings returned by tokenizer.tokenize
tokens

['▁Hello', ',', '▁my', '▁name', '▁is', '▁Roberto']

### Get Index IDs for the Tokens in the Vocabulary

In [74]:
import torch

# Look up the vocabulary index of every token
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nest the IDs in an outer list so the tensor carries a leading batch dimension
input_ids = torch.tensor([token_ids])

In [75]:
# Display the ID tensor (a batch of one sequence)
input_ids

tensor([[18536,   112,   717,  1539,   419, 32177]])

### Get Embeddings with BigBird Model

In [78]:
from transformers import BigBirdConfig, BigBirdModel

checkpoint = "google/bigbird-roberta-base"

# Start from the checkpoint's configuration, then switch the attention
# implementation to "original_full"
model_config = BigBirdConfig.from_pretrained(checkpoint)
model_config.attention_type = "original_full"

# Load the pretrained weights under the modified configuration
model = BigBirdModel.from_pretrained(checkpoint, config=model_config)

# Run a forward pass without gradient tracking and keep the last hidden state
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

In [100]:
# Show the first token alongside its vocabulary ID and its full embedding vector
print(f"Token: {tokens[0]}\nIndex ID: {input_ids[0][0]}\nEmbedding: {embeddings[0][0]} \n")

Token: ▁Hello
Index ID: 18536
Embedding: tensor([-1.1636e-01, -2.0380e-02, -6.1083e-02, -3.8545e-01,  5.7417e-01,
         1.0884e-01, -1.0427e-01,  9.3820e-02, -5.6838e-02, -3.1793e-01,
         6.8207e-02, -2.5721e-02,  6.7848e-02, -9.3928e-02,  1.1609e-01,
         1.4538e-01,  2.2018e-02, -5.3071e-02,  7.6467e-03,  4.7149e-02,
        -1.6713e-01,  1.7986e-01, -7.9160e-02, -1.1746e-02, -2.7829e-01,
         2.2447e-01, -2.6268e-02,  1.2537e-01,  2.0905e-02, -1.1724e-01,
         2.1016e-01,  2.1734e-01,  3.0937e-02,  2.7159e-01, -2.1522e-01,
         3.5828e-01,  1.3691e-01,  1.3992e-02,  2.4448e-02,  1.2941e-01,
         3.6535e-01,  2.9806e-01,  4.1460e-02,  4.4437e-02, -2.3589e-01,
         2.3958e-01,  1.0241e-01, -1.2554e-01,  1.6656e-02, -6.7489e-02,
        -1.3410e-01, -1.0509e-01,  1.7687e-01, -1.3179e-02, -7.9537e-02,
         1.3339e-01, -1.1378e-01,  4.0585e-02,  7.6636e-02, -4.1417e-01,
         2.2140e-01, -9.4216e-02, -3.1294e-01,  3.3909e-02, -5.8064e-02,
         5

In [99]:
# How many leading values of each embedding vector to print
num_display_elements = 5

# Only one sequence is in the batch, so work on its embedding matrix directly
first_sequence = embeddings[0]

# Print token, vocabulary ID and a truncated embedding for every position
for i in range(len(tokens)):
    subset_embedding = first_sequence[i, :num_display_elements]
    print(f"Token: {tokens[i]}\nIndex ID: {input_ids[0][i]}\nEmbedding (subset): {subset_embedding}\n")

Token: ▁Hello
Index ID: 18536
Embedding (subset): tensor([-0.1164, -0.0204, -0.0611, -0.3855,  0.5742])

Token: ,
Index ID: 112
Embedding (subset): tensor([-0.1171, -0.0621, -0.0458, -0.3915,  0.6032])

Token: ▁my
Index ID: 717
Embedding (subset): tensor([ 0.1982,  0.1183,  0.1312, -0.3132,  0.4452])

Token: ▁name
Index ID: 1539
Embedding (subset): tensor([ 0.4725,  0.5227,  0.2076, -0.0177,  0.5755])

Token: ▁is
Index ID: 419
Embedding (subset): tensor([ 0.2397,  0.5524, -0.1053, -0.2849,  0.1071])

Token: ▁Roberto
Index ID: 32177
Embedding (subset): tensor([ 0.0671,  0.2040, -0.0366, -0.3737,  0.0768])



## Tokenizing and Embedding a Security Event Log

### Download Data Sample

In [56]:
import io
import os
import zipfile

import requests

repository_url = 'https://github.com/OTRF/Security-Datasets/raw/master/datasets/compound/GoldenSAMLADFSMailAccess/Microsoft365DefenderEvents.Zip'
output_directory = './'  # Destination directory for unzipped files

# Extract the filename from the URL
file_name = repository_url.split('/')[-1]
archive_path = os.path.join(output_directory, file_name)

# The archive is saved to disk after a successful download (see below), so its
# presence means the download and extraction were already done by an earlier run.
download_required = not os.path.exists(archive_path)

if download_required:
    # Bound the request so an unreachable host cannot hang the notebook forever
    response = requests.get(repository_url, timeout=60)

    if response.status_code == 200:
        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Unzip the downloaded bytes to the output directory
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
            zip_ref.extractall(output_directory)

        # Save the archive only after extraction succeeded; this file is what
        # the existence check above looks for, so re-runs skip the download.
        with open(archive_path, 'wb') as archive_file:
            archive_file.write(response.content)

        print(f"Downloaded and unzipped '{file_name}' to '{output_directory}'")
    else:
        print(f"Failed to download '{file_name}' from GitHub. Status code: {response.status_code}")
else:
    print(f"File '{file_name}' already exists in '{output_directory}', no need to download.")

File 'Microsoft365DefenderEvents.Zip' already exists in './', no need to download.


### Read Security Event Logs

In [102]:
import json

file_path = "Microsoft365DefenderEvents.json"

# Read the whole file as UTF-8 text; text mode normalises \r\n and \r to \n
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

# Parse one JSON document per line.
# Split on '\n' only: str.splitlines() also breaks on characters such as
# U+2028/U+2029/U+0085, which may appear unescaped inside JSON strings and would
# cut a record in half. Blank lines are skipped instead of crashing json.loads.
json_data = [json.loads(line) for line in data.split('\n') if line.strip()]

In [103]:
# Inspect one parsed event record (index 7); the same record is tokenized below
json_data[7]

{'Timestamp': '2021-08-02T13:32:07Z',
 'ActionType': 'MailItemsAccessed',
 'Application': 'Microsoft Exchange Online',
 'ApplicationId': 20893,
 'AccountObjectId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
 'AccountDisplayName': 'SimuLandApp',
 'IsAdminOperation': 0,
 'DeviceType': 'Other',
 'OSPlatform': 'Unknown',
 'IPAddress': '1.2.3.4',
 'IsAnonymousProxy': 0,
 'CountryCode': 'US',
 'City': 'chicago',
 'ISP': 'Microsoft 365 Common and Office Online server',
 'UserAgent_dynamic': None,
 'UserAgent_string': 'Client=REST;;',
 'ActivityType': 'Run',
 'ActivityObjects': [{'ServiceObjectType': 'Session ID',
   'Type': 'Structured object',
   'Role': 'Parameter'},
  {'Type': 'Task', 'Role': 'Target object', 'Name': 'MailItemsAccessed'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'MailAccessType',
   'Value': 'Bind'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'IsThrottled',
   'Value': 'False'},
  {'ApplicationInstance': 0,
   'ApplicationId': 11161,
   'Type':

### Tokenize One Event Log

In [104]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the BigBird checkpoint
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

# Serialize the parsed event back to a JSON string, then split it into tokens
event_text = json.dumps(json_data[7])
tokens = tokenizer.tokenize(event_text)

### Raw Example Log - Index 7

{"Timestamp":"2021-08-02T13:32:07Z","ActionType":"MailItemsAccessed","Application":"Microsoft Exchange Online","ApplicationId":20893,"AccountObjectId":"5a95e683-08ad-424e-a441-1d1aec52c02c","AccountDisplayName":"SimuLandApp","IsAdminOperation":0,"DeviceType":"Other","OSPlatform":"Unknown","IPAddress":"1.2.3.4","IsAnonymousProxy":0,"CountryCode":"US","City":"chicago","ISP":"Microsoft 365 Common and Office Online server","UserAgent_dynamic":null,"UserAgent_string":"Client=REST;;","ActivityType":"Run","ActivityObjects":[{"ServiceObjectType":"Session ID","Type":"Structured object","Role":"Parameter"},{"Type":"Task","Role":"Target object","Name":"MailItemsAccessed"},{"Type":"Property","Role":"Parameter","Name":"MailAccessType","Value":"Bind"},{"Type":"Property","Role":"Parameter","Name":"IsThrottled","Value":"False"},{"ApplicationInstance":0,"ApplicationId":11161,"Type":"User","Role":"Parameter","Name":"Gustavo Pedro","Id":"aead923d-498b-4f64-a66c-2af91447a8b6"},{"ApplicationInstance":0,"ApplicationId":11161,"Type":"Account","Role":"Actor","Name":"SimuLandApp","Id":"5a95e683-08ad-424e-a441-1d1aec52c02c"}],"ObjectName":"MailItemsAccessed","ObjectType":"Task","ObjectId":"","ReportId":"106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44","AdditionalFields":{"IsSatelliteProvider":false},"UserId":"","Permissions":null,"PermissionsAddedTo":"","RawEventData":{"OrganizationId":"00000000-0000-0000-0000-000000000000","CreationTime":"2021-08-02T13:32:07.0000000Z","RecordType":50,"Operation":"MailItemsAccessed","UserType":0,"Workload":"Exchange","Version":1,"UserKey":"100320015858B802","UserId":"pgustavo@simulandlabs.com","OriginatingServer":"AB1CD23EF4567 
(15.20.4200.000)\r\n","InternalLogonType":0,"OrganizationName":"simulandlabs.onmicrosoft.com","ClientInfoString":"Client=REST;;","MailboxOwnerSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","ClientIPAddress":"1.2.3.4","MailboxOwnerUPN":"pgustavo@simulandlabs.com","ExternalAccess":false,"ResultStatus":"Succeeded","Id":"699e0b10-1c53-403e-976f-ce0847a92b44","LogonUserSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","MailboxGuid":"d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b","LogonType":0,"OperationProperties":["@{Value=Bind; Name=MailAccessType}","@{Value=False; Name=IsThrottled}"],"OperationCount":7,"AppId":"00000003-0000-0000-c000-000000000000","Folders":["@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}"],"ClientAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c"},"spnID":"","rawData":{"OrganizationId":"00000000-0000-0000-0000-000000000000","CreationTime":"2021-08-02T13:32:07.0000000Z","RecordType":50,"Operation":"MailItemsAccessed","UserType":0,"Workload":"Exchange","Version":1,"UserKey":"100320015858B802","UserId":"pgustavo@simulandlabs.com","OriginatingServer":"AB1CD23EF4567 (15.20.4200.000)\r\n","InternalLogonType":0,"OrganizationName":"simulandlabs.onmicrosoft.com","ClientInfoString":"Client=REST;;","MailboxOwnerSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","ClientIPAddress":"1.2.3.4","MailboxOwnerUPN":"pgustavo@simulandlabs.com","ExternalAccess":false,"ResultStatus":"Succeeded","Id":"699e0b10-1c53-403e-976f-ce0847a92b44","LogonUserSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","MailboxGuid":"d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b","LogonType":0,"OperationProperties":["@{Value=Bind; Name=MailAccessType}","@{Value=False; Name=IsThrottled}"],"OperationCount":7,"AppId":"00000003-0000-0000-c000-000000000000","Folders":["@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; 
FolderItems=System.Object[]}"],"ClientAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c"},"AppId":"00000003-0000-0000-c000-000000000000","OAuthAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c","TargetAccountUpn":"","TargetAccountDisplayName":"","TargetDeviceName":"","DestinationDeviceName":"","DestinationIPAddress":"","DestinationPort":null,"Protocol":"","AccountName":"","AccountDomain":"","AccountUpn":"","AccountSid":"","DeviceName":"","Port":null,"Location":""}

In [111]:
# First ten tokens of the serialized event
tokens[:10]

['▁{"', 'Tim', 'estamp', '":', '▁"', '2', '021', '-', '08', '-']

In [113]:
# Tokens 20 through 29 of the serialized event
tokens[20:30]

['Action', 'Type', '":', '▁"', 'Mail', 'Items', 'Acc', 'essed', '",', '▁"']

In [106]:
import torch

# Look up the vocabulary index of every token
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nest the IDs in an outer list so the tensor carries a leading batch dimension
input_ids = torch.tensor([token_ids])

### Generate Embeddings for One Event Log

In [107]:
from transformers import BigBirdModel

# Load the pretrained BigBird encoder with the checkpoint's own configuration
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")

# Run a forward pass without gradient tracking and keep the last hidden state
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

In [108]:
# Display the last-hidden-state tensor returned by the model
embeddings

tensor([[[ 0.1869,  0.0935, -0.0685,  ..., -0.0211, -0.1009, -0.0862],
         [ 0.0773, -0.1688, -0.1072,  ...,  0.1626,  0.1574,  0.1664],
         [ 0.1790, -0.0747,  0.0009,  ...,  0.2164,  0.1874,  0.1451],
         ...,
         [ 0.1302, -0.0763, -0.1211,  ...,  0.0049, -0.3677, -0.0096],
         [ 0.3301, -0.5417, -0.3299,  ..., -0.2153, -0.4639, -0.1000],
         [ 0.1222, -0.2770, -0.0157,  ...,  0.0253, -0.0452,  0.0553]]])

In [109]:
# Length of the first (and only) batch entry along its first dimension
len(embeddings[0])

1708