Local: Tokenizing and Embedding


Basic Example

Tokenizing with BigBird Model Tokenizer

from transformers import AutoTokenizer

# Load the pretrained tokenizer published with the BigBird RoBERTa checkpoint
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

# Split a short example sentence into tokens
sample_text = "Hello, my name is Roberto"
tokens = tokenizer.tokenize(sample_text)
tokens
['▁Hello', ',', '▁my', '▁name', '▁is', '▁Roberto']

Get Index IDs for the Tokens in the Vocabulary

import torch

# Look up the vocabulary index of every token
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nest the ID list one level so the tensor is created with a leading batch dimension
input_ids = torch.tensor([token_ids])
input_ids
tensor([[18536,   112,   717,  1539,   419, 32177]])

Get Embeddings with BigBird Model

from transformers import BigBirdConfig, BigBirdModel

checkpoint = "google/bigbird-roberta-base"

# Start from the checkpoint's own configuration and switch the attention
# implementation to "original_full".
# NOTE(review): presumably chosen because the example input is only a handful
# of tokens long -- confirm against the BigBird documentation.
model_config = BigBirdConfig.from_pretrained(checkpoint)
model_config.attention_type = "original_full"
model = BigBirdModel.from_pretrained(checkpoint, config=model_config)

# Run a forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(input_ids)

# The last hidden state is what this notebook treats as the per-token embeddings
embeddings = outputs.last_hidden_state
print(f"Token: {tokens[0]}\nIndex ID: {input_ids[0][0]}\nEmbedding: {embeddings[0][0]} \n")
Token: ▁Hello
Index ID: 18536
Embedding: tensor([-1.1636e-01, -2.0380e-02, -6.1083e-02, -3.8545e-01,  5.7417e-01,
         1.0884e-01, -1.0427e-01,  9.3820e-02, -5.6838e-02, -3.1793e-01,
         6.8207e-02, -2.5721e-02,  6.7848e-02, -9.3928e-02,  1.1609e-01,
         1.4538e-01,  2.2018e-02, -5.3071e-02,  7.6467e-03,  4.7149e-02,
        -1.6713e-01,  1.7986e-01, -7.9160e-02, -1.1746e-02, -2.7829e-01,
         2.2447e-01, -2.6268e-02,  1.2537e-01,  2.0905e-02, -1.1724e-01,
         2.1016e-01,  2.1734e-01,  3.0937e-02,  2.7159e-01, -2.1522e-01,
         3.5828e-01,  1.3691e-01,  1.3992e-02,  2.4448e-02,  1.2941e-01,
         3.6535e-01,  2.9806e-01,  4.1460e-02,  4.4437e-02, -2.3589e-01,
         2.3958e-01,  1.0241e-01, -1.2554e-01,  1.6656e-02, -6.7489e-02,
        -1.3410e-01, -1.0509e-01,  1.7687e-01, -1.3179e-02, -7.9537e-02,
         1.3339e-01, -1.1378e-01,  4.0585e-02,  7.6636e-02, -4.1417e-01,
         2.2140e-01, -9.4216e-02, -3.1294e-01,  3.3909e-02, -5.8064e-02,
         5.5965e-03,  1.0053e-01,  7.6107e-02, -1.6494e-01,  1.0267e-01,
        -8.9715e-02, -2.3077e-01, -5.9641e-04, -1.6648e-01,  1.3973e-01,
         6.6399e-02,  5.1237e-02, -6.1343e-01,  7.7089e-02,  1.9489e-01,
         1.6679e-01, -3.6878e-01,  4.0401e-01,  4.3976e-02,  7.0080e-02,
        -1.1072e-01, -3.1902e-01,  5.7557e-02,  1.5039e-01,  3.7570e-01,
        -6.5676e-03, -1.8109e-01, -2.6733e-02,  6.4456e-02, -1.5362e-02,
        -3.9735e-01, -1.7642e-01, -8.8574e-02,  8.2169e-02, -2.3364e-01,
        -9.7785e-03,  5.4608e-02,  6.5257e-02,  1.3635e-01,  3.4155e-01,
         7.6657e-02, -3.1519e-01, -2.6139e-01, -2.7790e-03,  8.6237e-02,
        -2.3414e-02, -1.5799e-02,  9.2869e-02,  1.5318e-01, -2.2860e-01,
         7.2682e-02, -1.6873e-01, -3.0805e-01,  5.8212e-03,  3.4121e-02,
        -5.9611e-02, -2.1086e-01, -3.6269e-02,  1.1456e-01,  7.7441e-02,
         4.0523e-02,  1.2661e-01,  2.6328e-01,  1.1657e-01, -5.4348e-03,
        -6.1856e-02, -1.2447e-01,  2.9078e-01,  4.8988e-02,  2.3876e-02,
        -9.2600e-02,  2.2589e-01, -7.1960e-03, -2.0627e-01, -1.8147e-02,
         5.5104e-02,  1.0189e-01,  1.9438e-01,  4.5833e-02, -9.2392e-03,
        -1.6446e-01, -2.5202e-01, -3.1434e-01,  4.2671e-02,  1.3502e-01,
         9.6235e-02,  3.9788e-02, -1.8269e-01, -3.1492e-01, -6.6831e-02,
         1.1559e+00, -2.9708e-02,  1.5390e-01,  1.1358e-01,  9.6380e-02,
         2.5466e-01,  1.4219e-01,  1.7253e-01, -6.7271e-02, -8.9888e-02,
         8.1218e-02, -2.2050e-01,  1.5366e-01, -2.7897e-01, -8.3203e-02,
        -1.1591e-01,  2.7777e-01, -1.3716e-01, -2.6728e-01,  1.5150e-02,
        -6.6391e-02,  2.4254e-01,  7.7544e-02, -1.4343e-01, -1.4653e-01,
        -8.9712e-02,  5.6309e-02,  3.3134e-01, -2.8285e-01,  1.8264e-01,
         3.4600e-01,  4.7534e-01,  3.5328e-02, -7.8802e-02,  1.7221e-01,
         1.3720e-01,  1.1759e-01, -8.6466e-02, -1.2543e-02, -6.5371e-02,
         1.3755e-01, -2.9762e-02, -2.4923e-01,  1.5668e-01, -9.0752e-02,
         8.2167e-03, -1.6505e-02,  1.6438e-01,  1.8271e-01,  2.1629e-01,
        -2.0046e-01,  2.3587e-01,  8.4096e-02, -8.9142e-02,  2.1799e-01,
        -4.7325e-02, -1.8031e-01, -1.3593e-01,  1.1050e-01,  2.8516e-02,
        -7.0048e-02,  2.3932e-01, -5.8864e-02,  4.4210e-02, -1.9005e-02,
         4.1998e-01, -1.2285e-01,  1.4732e-01,  1.8190e-01,  1.8617e-01,
         1.2658e-01,  2.3555e-01, -1.0678e-01,  2.9762e-01,  1.0807e-01,
        -1.3972e-01,  1.0254e-01,  2.8220e-02, -1.0848e-03, -2.9666e-02,
        -1.1325e-01, -1.6394e-01, -1.0015e-01, -2.6119e-01, -1.5041e-01,
         3.7000e-02,  1.3124e-01, -1.1700e-01,  1.5929e-01,  7.0666e-02,
        -1.4327e-01, -4.5101e-02,  7.7282e-02, -9.3982e-02,  1.0810e-01,
        -7.8679e-02, -4.2005e-01,  6.4883e-02, -3.6853e-01,  1.3792e-01,
         9.9273e-02,  7.7822e-02,  8.1490e-03, -2.6939e-01, -2.5332e-01,
        -1.5344e-01,  3.5467e-01, -7.4576e-02,  1.4070e-01, -1.6303e-01,
        -4.5512e-01,  3.8096e-03,  1.4180e-01,  2.1611e-01, -1.3778e-01,
         1.3915e-02,  8.8185e-02,  8.5387e-02, -1.0825e-02, -1.1628e-01,
         1.5133e-01,  5.1713e-02, -3.6666e-01,  8.9856e-03,  6.1384e-02,
         3.2566e-01,  1.5698e-01,  4.5102e-02, -9.1748e-02,  6.1022e-03,
         6.7989e-02, -1.7882e-01, -4.2545e-01,  3.3579e-02,  2.6070e-01,
         1.9517e-01,  1.6657e-01,  5.0921e-02, -4.0859e-02, -2.3622e-01,
         1.1225e-01,  1.2212e-01, -3.4605e-02,  8.1477e-02, -1.2558e-01,
         1.1156e-01,  3.6761e-03, -1.3694e-01, -2.2334e-01,  1.1985e-01,
        -2.6214e-02,  4.3161e-02, -1.5386e-01,  9.9220e-02, -5.5973e-02,
        -1.9602e-01,  1.4341e-01,  1.0897e-01,  3.7060e-01, -2.3487e-02,
        -1.0029e-01,  1.0519e-01,  2.2279e-01,  4.9273e-02, -8.8852e-02,
        -1.4109e-01,  3.5998e-03,  8.2714e-02, -7.5794e-02,  2.3602e-03,
         1.1261e-01,  4.3421e-02,  1.7652e-01, -1.0864e-01,  8.2341e-01,
        -3.4869e-03,  1.0346e-03,  3.5852e-01,  2.1665e-01, -2.8769e-01,
         7.5744e-02,  1.3908e-01,  1.1003e-01,  4.5481e-01,  1.9171e-01,
         1.4931e-01,  1.4257e-01, -2.3037e-01,  1.3965e-01, -1.8462e-02,
         1.0343e-01, -8.1092e-02, -1.8032e-01, -1.8582e-02,  4.3680e-01,
        -6.0520e-02, -1.1731e-02,  6.8673e-02, -1.7749e-02,  6.9948e-02,
         4.6084e-02, -1.1374e-01,  2.1613e-01,  1.1426e-01, -4.4291e-01,
         2.6911e-01,  2.8104e-01, -1.6549e-01,  1.0510e-01, -3.8595e-01,
        -1.2112e-01,  3.1570e-01,  3.2223e-02,  2.3265e-03,  2.2583e-01,
         2.2892e-01, -3.5669e-02,  5.5374e-02, -4.6082e-01, -6.9463e-02,
        -3.1969e-01,  1.4195e-01, -1.0212e-01,  2.4094e-01, -6.6254e-02,
        -6.5807e-02, -9.6821e-02,  2.3727e-01, -1.5682e-01, -1.8467e-01,
        -2.6433e-01, -1.2356e-01,  2.8136e-02, -3.8417e-02, -2.2588e-01,
         8.7069e-02,  1.5041e-01, -1.1330e-02,  2.0777e-01, -5.2667e-02,
        -3.5839e-01, -2.4291e-01, -3.8074e-02,  2.3650e-01, -6.6348e-02,
         3.6094e-02,  2.4189e-01,  1.5915e-01,  2.1828e-01, -2.4702e-01,
         2.0885e-01,  3.0356e-01, -8.4988e-02, -2.0408e-01,  4.1214e-02,
         2.7640e-02,  1.0529e-01, -1.0015e-01, -7.5593e-02,  2.5404e-01,
        -3.8558e-02, -9.3942e-02,  9.6083e-02, -6.8022e-02, -1.6198e-01,
        -2.6486e-02,  2.4777e-01,  2.6740e-01,  1.5189e-01,  1.9176e-01,
         3.3901e-02,  2.1967e-01, -4.1814e-01, -1.1531e-02, -2.7577e-01,
         3.1706e-01,  2.2221e-01,  1.5919e-01, -1.2716e-01,  7.8425e-02,
        -2.4036e-01, -2.4433e-01, -1.0049e-01, -1.7044e-01,  4.4139e-02,
        -1.4504e-01, -7.0745e-02, -2.1589e-01, -1.0893e-01,  5.3016e-01,
         6.8166e-02, -1.3483e-01,  1.2181e-01,  3.1380e-01, -2.5864e-01,
         2.6604e-01,  8.9654e-03, -7.6018e-02, -1.3141e+00, -1.1336e-01,
        -6.3815e-02,  1.9731e-01,  2.5545e-01, -5.8033e-02,  1.5419e-01,
        -1.0425e-01, -9.3440e-02, -9.6354e-02,  5.3156e-02, -2.6236e-01,
        -2.7334e-01,  5.4305e-02,  2.1077e-01, -1.5582e-01,  1.1502e-01,
        -7.2804e-02, -1.2972e-01, -3.9262e-02, -1.7156e-01,  2.6468e-01,
        -1.1975e-02,  2.6765e-01,  2.8969e-01, -1.2662e-01, -3.0494e-01,
         3.2054e-02,  1.3120e-01, -5.3662e-02, -1.3033e-01, -3.3487e-02,
         6.6659e-02, -1.2525e-01,  6.1176e-02,  3.5805e-02,  7.7177e-02,
         3.6847e-02, -7.3659e-02,  7.1801e-03,  2.2346e-01, -3.6041e-01,
         3.2930e-01, -3.6310e-01,  2.0905e-01,  1.1914e-01,  1.8711e-02,
         3.6424e-02, -2.1259e-01,  1.7280e-01, -1.8592e-01, -9.4130e-03,
         1.7409e-01, -1.9176e-01,  1.5465e-01,  5.4870e-02, -1.1879e-01,
        -1.3106e-01, -7.0900e-02,  4.3117e-02,  9.8918e-03,  1.8036e-02,
        -8.9139e-02,  3.7314e-02, -4.4075e-02, -1.4569e-01,  9.1170e-02,
         1.4312e-01, -5.3165e-02,  1.1121e-03,  2.0978e-01, -9.8794e-03,
         4.2158e-02,  2.7332e-01,  2.5068e-01,  2.3489e-01, -2.4546e-01,
         4.3379e-01, -2.2498e-01, -3.6471e-02,  2.6790e-01, -1.5893e-01,
        -7.0857e-02,  1.0994e-01,  1.4304e-01, -2.0314e-01,  1.5672e-01,
        -1.0299e-01, -1.2334e-01,  9.4673e-02, -4.7860e-02,  1.8639e-01,
        -1.8374e-01, -9.9229e-02,  1.9476e-01,  2.6587e-01, -1.1859e-02,
        -7.1661e-02,  8.5858e-01, -1.4662e-01, -1.3256e-01,  9.0016e-02,
         2.8409e-01, -9.3416e-02,  1.3053e-01,  1.9137e-02,  1.4655e-01,
        -8.9044e-02,  6.3552e-02,  5.1784e-03, -1.0393e-01,  6.8222e-03,
        -1.0571e-01, -3.6165e-02,  1.3839e-01, -6.1466e-02,  4.1807e-02,
         1.6402e-01,  5.7555e-01,  4.0541e-02, -1.9156e-01,  2.6890e-01,
         8.7065e-02,  3.4070e-01,  5.1472e-02, -1.1112e-01,  3.1804e-01,
        -1.2821e-01, -2.1103e-01,  1.1119e-01, -2.2352e-01, -1.6754e-01,
         1.2009e-01,  7.7462e-02,  3.1534e-02,  1.4148e+01,  6.7538e-02,
         8.4337e-03,  1.8661e-01, -2.0979e-01,  1.2481e-01, -1.5175e-02,
        -3.3540e-02, -1.1672e-01,  1.3878e-01,  2.7092e-02, -1.8306e-01,
        -5.7783e-02, -2.4391e-01,  9.1538e-02, -8.8047e-02,  4.8807e-02,
        -5.5185e-02,  1.1760e-01, -1.5016e-03,  2.8545e-01,  3.0705e-02,
        -2.0540e-01,  5.9746e-02, -3.2546e-04,  3.3889e-01,  2.3636e-01,
        -1.1071e-01, -1.3425e-01, -9.5960e-02, -2.3532e-01,  2.0827e-01,
         1.0364e-01,  1.1353e-01,  2.0688e-01, -2.8800e-01,  3.4962e-01,
         2.0934e-01,  1.4722e-01,  5.1247e-02,  7.2226e-02, -1.4812e-01,
         4.0343e-02,  3.2904e-01,  2.2009e-01,  2.2765e-01,  1.7634e-01,
         7.4393e-02,  2.7124e-02, -8.8036e-02,  2.7232e-02,  1.2098e-01,
         2.1667e-01, -1.3163e-01, -1.9002e-01,  6.3883e-02,  3.7940e-02,
         7.7639e-02, -3.2495e-02, -1.4721e-01,  2.3456e-03,  1.0726e-01,
         2.6602e-01,  2.9699e-02, -1.0600e-01, -1.7380e-01,  4.2345e-01,
         2.1489e-01,  1.7938e-02,  1.0345e-01,  9.4192e-02,  1.3841e-01,
        -9.0320e-02,  8.9497e-02,  4.8507e-02, -4.2748e-01, -1.4755e-01,
        -4.5336e-02, -2.6415e-01,  1.5165e-01,  1.2128e-02, -8.0591e-03,
        -1.2446e-01, -1.8867e-01, -6.1890e-02,  3.0580e-01, -1.1294e-01,
         2.5704e-02, -2.0962e-02, -2.4601e-01, -1.3209e-01,  2.7346e-01,
         2.3948e-02, -2.1149e-01, -1.3995e-01, -1.1723e-01,  2.8653e-02,
        -9.8611e-02,  3.7482e-01, -1.0707e-01, -1.4909e-01, -1.2980e-02,
        -2.7651e-01,  2.6002e-01, -2.2535e-01,  9.1586e-02, -1.0252e-01,
         2.9707e-01,  1.0359e-01,  9.4246e-02, -1.4140e-01, -2.0433e-01,
         1.3755e-01,  8.4514e-02, -3.8167e-02, -1.6627e-01, -2.6318e-01,
        -1.7245e-01,  1.7124e-01, -2.2904e-01,  1.6619e-01,  8.5417e-02,
        -2.1021e-01, -8.6555e-02, -8.8081e-02,  7.7535e-02, -4.9845e-02,
        -1.2976e-01, -3.9131e-02, -1.0433e-03, -3.5251e-02,  2.2621e-02,
         1.2982e-01, -1.1672e-01,  8.4644e-02,  8.5460e-03, -5.6842e-02,
        -1.2801e-01,  5.1929e-01,  4.3736e-01,  8.5156e-02, -1.4027e-01,
         1.6127e-01, -3.8899e-01, -3.6612e-01, -5.7083e-02, -2.0425e-02,
        -5.9496e-02,  6.1161e-02, -3.4648e-01,  7.1157e-02, -7.4365e-02,
         1.7982e-02,  3.6682e-02, -5.8530e-02,  2.4198e-01, -9.8698e-04,
        -4.1814e-02,  1.2658e-01, -2.7362e-02,  1.7819e-01,  4.7886e-02,
         4.8654e-01, -1.4443e-02,  1.1563e-01, -2.4416e-01,  2.1088e-02,
        -1.7257e-01, -4.1795e-02, -5.8262e-02,  1.0879e-01, -3.8052e-02,
         1.2507e-01,  5.6900e-02,  3.1752e-02, -1.3258e-02,  2.6444e-01,
        -1.6823e-01, -9.8095e-03, -3.2275e-01]) 
# Number of leading components to show from each embedding vector
num_display_elements = 5

# Print every token next to its vocabulary ID and a short slice of its embedding.
# The three sequences are parallel (one entry per token), so they are walked in
# lockstep with zip() instead of being indexed by position.
for token, token_id, vector in zip(tokens, input_ids[0], embeddings[0]):
    subset_embedding = vector[:num_display_elements]
    print(f"Token: {token}\nIndex ID: {token_id}\nEmbedding (subset): {subset_embedding}\n")
Token: ▁Hello
Index ID: 18536
Embedding (subset): tensor([-0.1164, -0.0204, -0.0611, -0.3855,  0.5742])

Token: ,
Index ID: 112
Embedding (subset): tensor([-0.1171, -0.0621, -0.0458, -0.3915,  0.6032])

Token: ▁my
Index ID: 717
Embedding (subset): tensor([ 0.1982,  0.1183,  0.1312, -0.3132,  0.4452])

Token: ▁name
Index ID: 1539
Embedding (subset): tensor([ 0.4725,  0.5227,  0.2076, -0.0177,  0.5755])

Token: ▁is
Index ID: 419
Embedding (subset): tensor([ 0.2397,  0.5524, -0.1053, -0.2849,  0.1071])

Token: ▁Roberto
Index ID: 32177
Embedding (subset): tensor([ 0.0671,  0.2040, -0.0366, -0.3737,  0.0768])

Tokenizing and Embedding a Security Event Log

Download Data Sample

import os

repository_url = 'https://github.com/OTRF/Security-Datasets/raw/master/datasets/compound/GoldenSAMLADFSMailAccess/Microsoft365DefenderEvents.Zip'
output_directory = './'  # Destination directory for unzipped files

# Extract the filename from the URL
file_name = repository_url.split('/')[-1]
archive_path = os.path.join(output_directory, file_name)

# The archive on disk is the marker that a previous run already downloaded
# and extracted the sample.
download_required = not os.path.exists(archive_path)

if download_required:
    import requests
    import zipfile
    import io

    # Download the zip file; the timeout keeps an unresponsive server from
    # blocking the run indefinitely.
    response = requests.get(repository_url, timeout=60)

    if response.status_code == 200:
        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Unzip the downloaded bytes to the output directory
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
            zip_ref.extractall(output_directory)

        # Persist the archive only after a successful extraction, so the
        # existence check above can skip the download on later runs without
        # ever caching a corrupt archive.
        with open(archive_path, 'wb') as archive_file:
            archive_file.write(response.content)

        print(f"Downloaded and unzipped '{file_name}' to '{output_directory}'")
    else:
        print(f"Failed to download '{file_name}' from GitHub. Status code: {response.status_code}")
else:
    print(f"File '{file_name}' already exists in '{output_directory}', no need to download.")
File 'Microsoft365DefenderEvents.Zip' already exists in './', no need to download.

Read Security Event Logs

import json

file_path = "Microsoft365DefenderEvents.json"

# Read the whole file as UTF-8 text (the encoding JSON is specified in),
# rather than relying on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

# The file holds one JSON document per line. Split on "\n" only:
# str.splitlines() would also break on characters such as U+2028 / U+2029,
# which may legally appear unescaped inside a JSON string and would cut a
# record in half. Blank lines (e.g. a trailing newline) are skipped instead
# of being handed to json.loads.
json_data = [json.loads(line) for line in data.split('\n') if line.strip()]
json_data[7]
{'Timestamp': '2021-08-02T13:32:07Z',
 'ActionType': 'MailItemsAccessed',
 'Application': 'Microsoft Exchange Online',
 'ApplicationId': 20893,
 'AccountObjectId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
 'AccountDisplayName': 'SimuLandApp',
 'IsAdminOperation': 0,
 'DeviceType': 'Other',
 'OSPlatform': 'Unknown',
 'IPAddress': '1.2.3.4',
 'IsAnonymousProxy': 0,
 'CountryCode': 'US',
 'City': 'chicago',
 'ISP': 'Microsoft 365 Common and Office Online server',
 'UserAgent_dynamic': None,
 'UserAgent_string': 'Client=REST;;',
 'ActivityType': 'Run',
 'ActivityObjects': [{'ServiceObjectType': 'Session ID',
   'Type': 'Structured object',
   'Role': 'Parameter'},
  {'Type': 'Task', 'Role': 'Target object', 'Name': 'MailItemsAccessed'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'MailAccessType',
   'Value': 'Bind'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'IsThrottled',
   'Value': 'False'},
  {'ApplicationInstance': 0,
   'ApplicationId': 11161,
   'Type': 'User',
   'Role': 'Parameter',
   'Name': 'Gustavo Pedro',
   'Id': 'aead923d-498b-4f64-a66c-2af91447a8b6'},
  {'ApplicationInstance': 0,
   'ApplicationId': 11161,
   'Type': 'Account',
   'Role': 'Actor',
   'Name': 'SimuLandApp',
   'Id': '5a95e683-08ad-424e-a441-1d1aec52c02c'}],
 'ObjectName': 'MailItemsAccessed',
 'ObjectType': 'Task',
 'ObjectId': '',
 'ReportId': '106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44',
 'AdditionalFields': {'IsSatelliteProvider': False},
 'UserId': '',
 'Permissions': None,
 'PermissionsAddedTo': '',
 'RawEventData': {'OrganizationId': '00000000-0000-0000-0000-000000000000',
  'CreationTime': '2021-08-02T13:32:07.0000000Z',
  'RecordType': 50,
  'Operation': 'MailItemsAccessed',
  'UserType': 0,
  'Workload': 'Exchange',
  'Version': 1,
  'UserKey': '100320015858B802',
  'UserId': 'pgustavo@simulandlabs.com',
  'OriginatingServer': 'AB1CD23EF4567 (15.20.4200.000)\r\n',
  'InternalLogonType': 0,
  'OrganizationName': 'simulandlabs.onmicrosoft.com',
  'ClientInfoString': 'Client=REST;;',
  'MailboxOwnerSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
  'ClientIPAddress': '1.2.3.4',
  'MailboxOwnerUPN': 'pgustavo@simulandlabs.com',
  'ExternalAccess': False,
  'ResultStatus': 'Succeeded',
  'Id': '699e0b10-1c53-403e-976f-ce0847a92b44',
  'LogonUserSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
  'MailboxGuid': 'd0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b',
  'LogonType': 0,
  'OperationProperties': ['@{Value=Bind; Name=MailAccessType}',
   '@{Value=False; Name=IsThrottled}'],
  'OperationCount': 7,
  'AppId': '00000003-0000-0000-c000-000000000000',
  'Folders': ['@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}'],
  'ClientAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c'},
 'spnID': '',
 'rawData': {'OrganizationId': '00000000-0000-0000-0000-000000000000',
  'CreationTime': '2021-08-02T13:32:07.0000000Z',
  'RecordType': 50,
  'Operation': 'MailItemsAccessed',
  'UserType': 0,
  'Workload': 'Exchange',
  'Version': 1,
  'UserKey': '100320015858B802',
  'UserId': 'pgustavo@simulandlabs.com',
  'OriginatingServer': 'AB1CD23EF4567 (15.20.4200.000)\r\n',
  'InternalLogonType': 0,
  'OrganizationName': 'simulandlabs.onmicrosoft.com',
  'ClientInfoString': 'Client=REST;;',
  'MailboxOwnerSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
  'ClientIPAddress': '1.2.3.4',
  'MailboxOwnerUPN': 'pgustavo@simulandlabs.com',
  'ExternalAccess': False,
  'ResultStatus': 'Succeeded',
  'Id': '699e0b10-1c53-403e-976f-ce0847a92b44',
  'LogonUserSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
  'MailboxGuid': 'd0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b',
  'LogonType': 0,
  'OperationProperties': ['@{Value=Bind; Name=MailAccessType}',
   '@{Value=False; Name=IsThrottled}'],
  'OperationCount': 7,
  'AppId': '00000003-0000-0000-c000-000000000000',
  'Folders': ['@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}'],
  'ClientAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c'},
 'AppId': '00000003-0000-0000-c000-000000000000',
 'OAuthAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
 'TargetAccountUpn': '',
 'TargetAccountDisplayName': '',
 'TargetDeviceName': '',
 'DestinationDeviceName': '',
 'DestinationIPAddress': '',
 'DestinationPort': None,
 'Protocol': '',
 'AccountName': '',
 'AccountDomain': '',
 'AccountUpn': '',
 'AccountSid': '',
 'DeviceName': '',
 'Port': None,
 'Location': ''}

Tokenize One Event Log

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

# Serialize the event at index 7 back to a JSON string, then tokenize that text
event_text = json.dumps(json_data[7])
tokens = tokenizer.tokenize(event_text)

Raw Example Log - Index 7

{“Timestamp”:”2021-08-02T13:32:07Z”,”ActionType”:”MailItemsAccessed”,”Application”:”Microsoft Exchange Online”,”ApplicationId”:20893,”AccountObjectId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”,”AccountDisplayName”:”SimuLandApp”,”IsAdminOperation”:0,”DeviceType”:”Other”,”OSPlatform”:”Unknown”,”IPAddress”:”1.2.3.4”,”IsAnonymousProxy”:0,”CountryCode”:”US”,”City”:”chicago”,”ISP”:”Microsoft 365 Common and Office Online server”,”UserAgent_dynamic”:null,”UserAgent_string”:”Client=REST;;”,”ActivityType”:”Run”,”ActivityObjects”:[{“ServiceObjectType”:”Session ID”,”Type”:”Structured object”,”Role”:”Parameter”},{“Type”:”Task”,”Role”:”Target object”,”Name”:”MailItemsAccessed”},{“Type”:”Property”,”Role”:”Parameter”,”Name”:”MailAccessType”,”Value”:”Bind”},{“Type”:”Property”,”Role”:”Parameter”,”Name”:”IsThrottled”,”Value”:”False”},{“ApplicationInstance”:0,”ApplicationId”:11161,”Type”:”User”,”Role”:”Parameter”,”Name”:”Gustavo Pedro”,”Id”:”aead923d-498b-4f64-a66c-2af91447a8b6”},{“ApplicationInstance”:0,”ApplicationId”:11161,”Type”:”Account”,”Role”:”Actor”,”Name”:”SimuLandApp”,”Id”:”5a95e683-08ad-424e-a441-1d1aec52c02c”}],”ObjectName”:”MailItemsAccessed”,”ObjectType”:”Task”,”ObjectId”:””,”ReportId”:”106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44”,”AdditionalFields”:{“IsSatelliteProvider”:false},”UserId”:””,”Permissions”:null,”PermissionsAddedTo”:””,”RawEventData”:{”OrganizationId”:”00000000-0000-0000-0000-000000000000”,”CreationTime”:”2021-08-02T13:32:07.0000000Z”,”RecordType”:50,”Operation”:”MailItemsAccessed”,”UserType”:0,”Workload”:”Exchange”,”Version”:1,”UserKey”:”100320015858B802”,”UserId”:”pgustavo@simulandlabs.com”,”OriginatingServer”:”AB1CD23EF4567 
(15.20.4200.000)\r\n”,”InternalLogonType”:0,”OrganizationName”:”simulandlabs.onmicrosoft.com”,”ClientInfoString”:”Client=REST;;”,”MailboxOwnerSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”ClientIPAddress”:”1.2.3.4”,”MailboxOwnerUPN”:”pgustavo@simulandlabs.com”,”ExternalAccess”:false,”ResultStatus”:”Succeeded”,”Id”:”699e0b10-1c53-403e-976f-ce0847a92b44”,”LogonUserSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”MailboxGuid”:”d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b”,”LogonType”:0,”OperationProperties”:[“@{Value=Bind; Name=MailAccessType}”,”@{Value=False; Name=IsThrottled}”],”OperationCount”:7,”AppId”:”00000003-0000-0000-c000-000000000000”,”Folders”:[“@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\Inbox; FolderItems=System.Object[]}”],”ClientAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”},”spnID”:””,”rawData”:{”OrganizationId”:”00000000-0000-0000-0000-000000000000”,”CreationTime”:”2021-08-02T13:32:07.0000000Z”,”RecordType”:50,”Operation”:”MailItemsAccessed”,”UserType”:0,”Workload”:”Exchange”,”Version”:1,”UserKey”:”100320015858B802”,”UserId”:”pgustavo@simulandlabs.com”,”OriginatingServer”:”AB1CD23EF4567 (15.20.4200.000)\r\n”,”InternalLogonType”:0,”OrganizationName”:”simulandlabs.onmicrosoft.com”,”ClientInfoString”:”Client=REST;;”,”MailboxOwnerSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”ClientIPAddress”:”1.2.3.4”,”MailboxOwnerUPN”:”pgustavo@simulandlabs.com”,”ExternalAccess”:false,”ResultStatus”:”Succeeded”,”Id”:”699e0b10-1c53-403e-976f-ce0847a92b44”,”LogonUserSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”MailboxGuid”:”d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b”,”LogonType”:0,”OperationProperties”:[“@{Value=Bind; Name=MailAccessType}”,”@{Value=False; Name=IsThrottled}”],”OperationCount”:7,”AppId”:”00000003-0000-0000-c000-000000000000”,”Folders”:[“@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\Inbox; 
FolderItems=System.Object[]}”],”ClientAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”},”AppId”:”00000003-0000-0000-c000-000000000000”,”OAuthAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”,”TargetAccountUpn”:””,”TargetAccountDisplayName”:””,”TargetDeviceName”:””,”DestinationDeviceName”:””,”DestinationIPAddress”:””,”DestinationPort”:null,”Protocol”:””,”AccountName”:””,”AccountDomain”:””,”AccountUpn”:””,”AccountSid”:””,”DeviceName”:””,”Port”:null,”Location”:””}

# Show the first ten tokens produced for the event log
tokens[:10]
['▁{"', 'Tim', 'estamp', '":', '▁"', '2', '021', '-', '08', '-']
# Show the tokens at positions 20 through 29
tokens[20:30]
['Action', 'Type', '":', '▁"', 'Mail', 'Items', 'Acc', 'essed', '",', '▁"']
import torch

# Map every token of the event log to its vocabulary index
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nest the ID list one level so the tensor is created with a leading batch dimension
input_ids = torch.tensor([token_ids])

Generate Embeddings for One Event Log

from transformers import BigBirdModel

checkpoint = "google/bigbird-roberta-base"

# Load the pretrained BigBird model with the checkpoint's default configuration
model = BigBirdModel.from_pretrained(checkpoint)

# Run a forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(input_ids)

# The last hidden state is what this notebook treats as the per-token embeddings
embeddings = outputs.last_hidden_state
embeddings
tensor([[[ 0.1869,  0.0935, -0.0685,  ..., -0.0211, -0.1009, -0.0862],
         [ 0.0773, -0.1688, -0.1072,  ...,  0.1626,  0.1574,  0.1664],
         [ 0.1790, -0.0747,  0.0009,  ...,  0.2164,  0.1874,  0.1451],
         ...,
         [ 0.1302, -0.0763, -0.1211,  ...,  0.0049, -0.3677, -0.0096],
         [ 0.3301, -0.5417, -0.3299,  ..., -0.2153, -0.4639, -0.1000],
         [ 0.1222, -0.2770, -0.0157,  ...,  0.0253, -0.0452,  0.0553]]])
# Number of embedding vectors in the first (and only) batch entry
len(embeddings[0])
1708