Local: Tokenizing and Embedding#
Collaborators:
Roberto Rodriguez (@Cyb3rWard0g)
References:
Basic Example#
Tokenizing with BigBird Model Tokenizer#
from transformers import AutoTokenizer

# Load the tokenizer published with the pretrained BigBird checkpoint
checkpoint = "google/bigbird-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Split a short sample sentence into tokens
sample_text = "Hello, my name is Roberto"
tokens = tokenizer.tokenize(sample_text)
tokens
['▁Hello', ',', '▁my', '▁name', '▁is', '▁Roberto']
Get Index IDs for the Tokens in the Vocabulary#
import torch

# Look up the vocabulary index of every token
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nesting the ID list inside an outer list gives the tensor its leading
# batch dimension (one sequence in the batch)
input_ids = torch.tensor([token_ids])
input_ids
tensor([[18536, 112, 717, 1539, 419, 32177]])
Get Embeddings with BigBird Model#
from transformers import BigBirdConfig, BigBirdModel

checkpoint = "google/bigbird-roberta-base"

# Start from the checkpoint's own configuration and switch its attention
# type to "original_full" before the model is built from it
model_config = BigBirdConfig.from_pretrained(checkpoint)
model_config.attention_type = "original_full"
model = BigBirdModel.from_pretrained(checkpoint, config=model_config)

# Run the model with gradient tracking disabled; only the outputs are needed
with torch.no_grad():
    outputs = model(input_ids)
embeddings = outputs.last_hidden_state

# Show the first token next to its vocabulary index and its full embedding
first_token = tokens[0]
first_index_id = input_ids[0, 0]
first_embedding = embeddings[0, 0]
print(f"Token: {first_token}\nIndex ID: {first_index_id}\nEmbedding: {first_embedding} \n")
Token: ▁Hello
Index ID: 18536
Embedding: tensor([-1.1636e-01, -2.0380e-02, -6.1083e-02, -3.8545e-01, 5.7417e-01,
1.0884e-01, -1.0427e-01, 9.3820e-02, -5.6838e-02, -3.1793e-01,
6.8207e-02, -2.5721e-02, 6.7848e-02, -9.3928e-02, 1.1609e-01,
1.4538e-01, 2.2018e-02, -5.3071e-02, 7.6467e-03, 4.7149e-02,
-1.6713e-01, 1.7986e-01, -7.9160e-02, -1.1746e-02, -2.7829e-01,
2.2447e-01, -2.6268e-02, 1.2537e-01, 2.0905e-02, -1.1724e-01,
2.1016e-01, 2.1734e-01, 3.0937e-02, 2.7159e-01, -2.1522e-01,
3.5828e-01, 1.3691e-01, 1.3992e-02, 2.4448e-02, 1.2941e-01,
3.6535e-01, 2.9806e-01, 4.1460e-02, 4.4437e-02, -2.3589e-01,
2.3958e-01, 1.0241e-01, -1.2554e-01, 1.6656e-02, -6.7489e-02,
-1.3410e-01, -1.0509e-01, 1.7687e-01, -1.3179e-02, -7.9537e-02,
1.3339e-01, -1.1378e-01, 4.0585e-02, 7.6636e-02, -4.1417e-01,
2.2140e-01, -9.4216e-02, -3.1294e-01, 3.3909e-02, -5.8064e-02,
5.5965e-03, 1.0053e-01, 7.6107e-02, -1.6494e-01, 1.0267e-01,
-8.9715e-02, -2.3077e-01, -5.9641e-04, -1.6648e-01, 1.3973e-01,
6.6399e-02, 5.1237e-02, -6.1343e-01, 7.7089e-02, 1.9489e-01,
1.6679e-01, -3.6878e-01, 4.0401e-01, 4.3976e-02, 7.0080e-02,
-1.1072e-01, -3.1902e-01, 5.7557e-02, 1.5039e-01, 3.7570e-01,
-6.5676e-03, -1.8109e-01, -2.6733e-02, 6.4456e-02, -1.5362e-02,
-3.9735e-01, -1.7642e-01, -8.8574e-02, 8.2169e-02, -2.3364e-01,
-9.7785e-03, 5.4608e-02, 6.5257e-02, 1.3635e-01, 3.4155e-01,
7.6657e-02, -3.1519e-01, -2.6139e-01, -2.7790e-03, 8.6237e-02,
-2.3414e-02, -1.5799e-02, 9.2869e-02, 1.5318e-01, -2.2860e-01,
7.2682e-02, -1.6873e-01, -3.0805e-01, 5.8212e-03, 3.4121e-02,
-5.9611e-02, -2.1086e-01, -3.6269e-02, 1.1456e-01, 7.7441e-02,
4.0523e-02, 1.2661e-01, 2.6328e-01, 1.1657e-01, -5.4348e-03,
-6.1856e-02, -1.2447e-01, 2.9078e-01, 4.8988e-02, 2.3876e-02,
-9.2600e-02, 2.2589e-01, -7.1960e-03, -2.0627e-01, -1.8147e-02,
5.5104e-02, 1.0189e-01, 1.9438e-01, 4.5833e-02, -9.2392e-03,
-1.6446e-01, -2.5202e-01, -3.1434e-01, 4.2671e-02, 1.3502e-01,
9.6235e-02, 3.9788e-02, -1.8269e-01, -3.1492e-01, -6.6831e-02,
1.1559e+00, -2.9708e-02, 1.5390e-01, 1.1358e-01, 9.6380e-02,
2.5466e-01, 1.4219e-01, 1.7253e-01, -6.7271e-02, -8.9888e-02,
8.1218e-02, -2.2050e-01, 1.5366e-01, -2.7897e-01, -8.3203e-02,
-1.1591e-01, 2.7777e-01, -1.3716e-01, -2.6728e-01, 1.5150e-02,
-6.6391e-02, 2.4254e-01, 7.7544e-02, -1.4343e-01, -1.4653e-01,
-8.9712e-02, 5.6309e-02, 3.3134e-01, -2.8285e-01, 1.8264e-01,
3.4600e-01, 4.7534e-01, 3.5328e-02, -7.8802e-02, 1.7221e-01,
1.3720e-01, 1.1759e-01, -8.6466e-02, -1.2543e-02, -6.5371e-02,
1.3755e-01, -2.9762e-02, -2.4923e-01, 1.5668e-01, -9.0752e-02,
8.2167e-03, -1.6505e-02, 1.6438e-01, 1.8271e-01, 2.1629e-01,
-2.0046e-01, 2.3587e-01, 8.4096e-02, -8.9142e-02, 2.1799e-01,
-4.7325e-02, -1.8031e-01, -1.3593e-01, 1.1050e-01, 2.8516e-02,
-7.0048e-02, 2.3932e-01, -5.8864e-02, 4.4210e-02, -1.9005e-02,
4.1998e-01, -1.2285e-01, 1.4732e-01, 1.8190e-01, 1.8617e-01,
1.2658e-01, 2.3555e-01, -1.0678e-01, 2.9762e-01, 1.0807e-01,
-1.3972e-01, 1.0254e-01, 2.8220e-02, -1.0848e-03, -2.9666e-02,
-1.1325e-01, -1.6394e-01, -1.0015e-01, -2.6119e-01, -1.5041e-01,
3.7000e-02, 1.3124e-01, -1.1700e-01, 1.5929e-01, 7.0666e-02,
-1.4327e-01, -4.5101e-02, 7.7282e-02, -9.3982e-02, 1.0810e-01,
-7.8679e-02, -4.2005e-01, 6.4883e-02, -3.6853e-01, 1.3792e-01,
9.9273e-02, 7.7822e-02, 8.1490e-03, -2.6939e-01, -2.5332e-01,
-1.5344e-01, 3.5467e-01, -7.4576e-02, 1.4070e-01, -1.6303e-01,
-4.5512e-01, 3.8096e-03, 1.4180e-01, 2.1611e-01, -1.3778e-01,
1.3915e-02, 8.8185e-02, 8.5387e-02, -1.0825e-02, -1.1628e-01,
1.5133e-01, 5.1713e-02, -3.6666e-01, 8.9856e-03, 6.1384e-02,
3.2566e-01, 1.5698e-01, 4.5102e-02, -9.1748e-02, 6.1022e-03,
6.7989e-02, -1.7882e-01, -4.2545e-01, 3.3579e-02, 2.6070e-01,
1.9517e-01, 1.6657e-01, 5.0921e-02, -4.0859e-02, -2.3622e-01,
1.1225e-01, 1.2212e-01, -3.4605e-02, 8.1477e-02, -1.2558e-01,
1.1156e-01, 3.6761e-03, -1.3694e-01, -2.2334e-01, 1.1985e-01,
-2.6214e-02, 4.3161e-02, -1.5386e-01, 9.9220e-02, -5.5973e-02,
-1.9602e-01, 1.4341e-01, 1.0897e-01, 3.7060e-01, -2.3487e-02,
-1.0029e-01, 1.0519e-01, 2.2279e-01, 4.9273e-02, -8.8852e-02,
-1.4109e-01, 3.5998e-03, 8.2714e-02, -7.5794e-02, 2.3602e-03,
1.1261e-01, 4.3421e-02, 1.7652e-01, -1.0864e-01, 8.2341e-01,
-3.4869e-03, 1.0346e-03, 3.5852e-01, 2.1665e-01, -2.8769e-01,
7.5744e-02, 1.3908e-01, 1.1003e-01, 4.5481e-01, 1.9171e-01,
1.4931e-01, 1.4257e-01, -2.3037e-01, 1.3965e-01, -1.8462e-02,
1.0343e-01, -8.1092e-02, -1.8032e-01, -1.8582e-02, 4.3680e-01,
-6.0520e-02, -1.1731e-02, 6.8673e-02, -1.7749e-02, 6.9948e-02,
4.6084e-02, -1.1374e-01, 2.1613e-01, 1.1426e-01, -4.4291e-01,
2.6911e-01, 2.8104e-01, -1.6549e-01, 1.0510e-01, -3.8595e-01,
-1.2112e-01, 3.1570e-01, 3.2223e-02, 2.3265e-03, 2.2583e-01,
2.2892e-01, -3.5669e-02, 5.5374e-02, -4.6082e-01, -6.9463e-02,
-3.1969e-01, 1.4195e-01, -1.0212e-01, 2.4094e-01, -6.6254e-02,
-6.5807e-02, -9.6821e-02, 2.3727e-01, -1.5682e-01, -1.8467e-01,
-2.6433e-01, -1.2356e-01, 2.8136e-02, -3.8417e-02, -2.2588e-01,
8.7069e-02, 1.5041e-01, -1.1330e-02, 2.0777e-01, -5.2667e-02,
-3.5839e-01, -2.4291e-01, -3.8074e-02, 2.3650e-01, -6.6348e-02,
3.6094e-02, 2.4189e-01, 1.5915e-01, 2.1828e-01, -2.4702e-01,
2.0885e-01, 3.0356e-01, -8.4988e-02, -2.0408e-01, 4.1214e-02,
2.7640e-02, 1.0529e-01, -1.0015e-01, -7.5593e-02, 2.5404e-01,
-3.8558e-02, -9.3942e-02, 9.6083e-02, -6.8022e-02, -1.6198e-01,
-2.6486e-02, 2.4777e-01, 2.6740e-01, 1.5189e-01, 1.9176e-01,
3.3901e-02, 2.1967e-01, -4.1814e-01, -1.1531e-02, -2.7577e-01,
3.1706e-01, 2.2221e-01, 1.5919e-01, -1.2716e-01, 7.8425e-02,
-2.4036e-01, -2.4433e-01, -1.0049e-01, -1.7044e-01, 4.4139e-02,
-1.4504e-01, -7.0745e-02, -2.1589e-01, -1.0893e-01, 5.3016e-01,
6.8166e-02, -1.3483e-01, 1.2181e-01, 3.1380e-01, -2.5864e-01,
2.6604e-01, 8.9654e-03, -7.6018e-02, -1.3141e+00, -1.1336e-01,
-6.3815e-02, 1.9731e-01, 2.5545e-01, -5.8033e-02, 1.5419e-01,
-1.0425e-01, -9.3440e-02, -9.6354e-02, 5.3156e-02, -2.6236e-01,
-2.7334e-01, 5.4305e-02, 2.1077e-01, -1.5582e-01, 1.1502e-01,
-7.2804e-02, -1.2972e-01, -3.9262e-02, -1.7156e-01, 2.6468e-01,
-1.1975e-02, 2.6765e-01, 2.8969e-01, -1.2662e-01, -3.0494e-01,
3.2054e-02, 1.3120e-01, -5.3662e-02, -1.3033e-01, -3.3487e-02,
6.6659e-02, -1.2525e-01, 6.1176e-02, 3.5805e-02, 7.7177e-02,
3.6847e-02, -7.3659e-02, 7.1801e-03, 2.2346e-01, -3.6041e-01,
3.2930e-01, -3.6310e-01, 2.0905e-01, 1.1914e-01, 1.8711e-02,
3.6424e-02, -2.1259e-01, 1.7280e-01, -1.8592e-01, -9.4130e-03,
1.7409e-01, -1.9176e-01, 1.5465e-01, 5.4870e-02, -1.1879e-01,
-1.3106e-01, -7.0900e-02, 4.3117e-02, 9.8918e-03, 1.8036e-02,
-8.9139e-02, 3.7314e-02, -4.4075e-02, -1.4569e-01, 9.1170e-02,
1.4312e-01, -5.3165e-02, 1.1121e-03, 2.0978e-01, -9.8794e-03,
4.2158e-02, 2.7332e-01, 2.5068e-01, 2.3489e-01, -2.4546e-01,
4.3379e-01, -2.2498e-01, -3.6471e-02, 2.6790e-01, -1.5893e-01,
-7.0857e-02, 1.0994e-01, 1.4304e-01, -2.0314e-01, 1.5672e-01,
-1.0299e-01, -1.2334e-01, 9.4673e-02, -4.7860e-02, 1.8639e-01,
-1.8374e-01, -9.9229e-02, 1.9476e-01, 2.6587e-01, -1.1859e-02,
-7.1661e-02, 8.5858e-01, -1.4662e-01, -1.3256e-01, 9.0016e-02,
2.8409e-01, -9.3416e-02, 1.3053e-01, 1.9137e-02, 1.4655e-01,
-8.9044e-02, 6.3552e-02, 5.1784e-03, -1.0393e-01, 6.8222e-03,
-1.0571e-01, -3.6165e-02, 1.3839e-01, -6.1466e-02, 4.1807e-02,
1.6402e-01, 5.7555e-01, 4.0541e-02, -1.9156e-01, 2.6890e-01,
8.7065e-02, 3.4070e-01, 5.1472e-02, -1.1112e-01, 3.1804e-01,
-1.2821e-01, -2.1103e-01, 1.1119e-01, -2.2352e-01, -1.6754e-01,
1.2009e-01, 7.7462e-02, 3.1534e-02, 1.4148e+01, 6.7538e-02,
8.4337e-03, 1.8661e-01, -2.0979e-01, 1.2481e-01, -1.5175e-02,
-3.3540e-02, -1.1672e-01, 1.3878e-01, 2.7092e-02, -1.8306e-01,
-5.7783e-02, -2.4391e-01, 9.1538e-02, -8.8047e-02, 4.8807e-02,
-5.5185e-02, 1.1760e-01, -1.5016e-03, 2.8545e-01, 3.0705e-02,
-2.0540e-01, 5.9746e-02, -3.2546e-04, 3.3889e-01, 2.3636e-01,
-1.1071e-01, -1.3425e-01, -9.5960e-02, -2.3532e-01, 2.0827e-01,
1.0364e-01, 1.1353e-01, 2.0688e-01, -2.8800e-01, 3.4962e-01,
2.0934e-01, 1.4722e-01, 5.1247e-02, 7.2226e-02, -1.4812e-01,
4.0343e-02, 3.2904e-01, 2.2009e-01, 2.2765e-01, 1.7634e-01,
7.4393e-02, 2.7124e-02, -8.8036e-02, 2.7232e-02, 1.2098e-01,
2.1667e-01, -1.3163e-01, -1.9002e-01, 6.3883e-02, 3.7940e-02,
7.7639e-02, -3.2495e-02, -1.4721e-01, 2.3456e-03, 1.0726e-01,
2.6602e-01, 2.9699e-02, -1.0600e-01, -1.7380e-01, 4.2345e-01,
2.1489e-01, 1.7938e-02, 1.0345e-01, 9.4192e-02, 1.3841e-01,
-9.0320e-02, 8.9497e-02, 4.8507e-02, -4.2748e-01, -1.4755e-01,
-4.5336e-02, -2.6415e-01, 1.5165e-01, 1.2128e-02, -8.0591e-03,
-1.2446e-01, -1.8867e-01, -6.1890e-02, 3.0580e-01, -1.1294e-01,
2.5704e-02, -2.0962e-02, -2.4601e-01, -1.3209e-01, 2.7346e-01,
2.3948e-02, -2.1149e-01, -1.3995e-01, -1.1723e-01, 2.8653e-02,
-9.8611e-02, 3.7482e-01, -1.0707e-01, -1.4909e-01, -1.2980e-02,
-2.7651e-01, 2.6002e-01, -2.2535e-01, 9.1586e-02, -1.0252e-01,
2.9707e-01, 1.0359e-01, 9.4246e-02, -1.4140e-01, -2.0433e-01,
1.3755e-01, 8.4514e-02, -3.8167e-02, -1.6627e-01, -2.6318e-01,
-1.7245e-01, 1.7124e-01, -2.2904e-01, 1.6619e-01, 8.5417e-02,
-2.1021e-01, -8.6555e-02, -8.8081e-02, 7.7535e-02, -4.9845e-02,
-1.2976e-01, -3.9131e-02, -1.0433e-03, -3.5251e-02, 2.2621e-02,
1.2982e-01, -1.1672e-01, 8.4644e-02, 8.5460e-03, -5.6842e-02,
-1.2801e-01, 5.1929e-01, 4.3736e-01, 8.5156e-02, -1.4027e-01,
1.6127e-01, -3.8899e-01, -3.6612e-01, -5.7083e-02, -2.0425e-02,
-5.9496e-02, 6.1161e-02, -3.4648e-01, 7.1157e-02, -7.4365e-02,
1.7982e-02, 3.6682e-02, -5.8530e-02, 2.4198e-01, -9.8698e-04,
-4.1814e-02, 1.2658e-01, -2.7362e-02, 1.7819e-01, 4.7886e-02,
4.8654e-01, -1.4443e-02, 1.1563e-01, -2.4416e-01, 2.1088e-02,
-1.7257e-01, -4.1795e-02, -5.8262e-02, 1.0879e-01, -3.8052e-02,
1.2507e-01, 5.6900e-02, 3.1752e-02, -1.3258e-02, 2.6444e-01,
-1.6823e-01, -9.8095e-03, -3.2275e-01])
# How many leading values of each embedding vector to print
num_display_elements = 5

# Walk the tokens, their vocabulary indices and their embedding vectors in
# lockstep, printing a short prefix of each vector for side-by-side comparison
for token, index_id, vector in zip(tokens, input_ids[0], embeddings[0]):
    subset_embedding = vector[:num_display_elements]
    print(f"Token: {token}\nIndex ID: {index_id}\nEmbedding (subset): {subset_embedding}\n")
Token: ▁Hello
Index ID: 18536
Embedding (subset): tensor([-0.1164, -0.0204, -0.0611, -0.3855, 0.5742])
Token: ,
Index ID: 112
Embedding (subset): tensor([-0.1171, -0.0621, -0.0458, -0.3915, 0.6032])
Token: ▁my
Index ID: 717
Embedding (subset): tensor([ 0.1982, 0.1183, 0.1312, -0.3132, 0.4452])
Token: ▁name
Index ID: 1539
Embedding (subset): tensor([ 0.4725, 0.5227, 0.2076, -0.0177, 0.5755])
Token: ▁is
Index ID: 419
Embedding (subset): tensor([ 0.2397, 0.5524, -0.1053, -0.2849, 0.1071])
Token: ▁Roberto
Index ID: 32177
Embedding (subset): tensor([ 0.0671, 0.2040, -0.0366, -0.3737, 0.0768])
Tokenizing and Embedding a Security Event Log#
Download Data Sample#
import os

repository_url = 'https://github.com/OTRF/Security-Datasets/raw/master/datasets/compound/GoldenSAMLADFSMailAccess/Microsoft365DefenderEvents.Zip'
output_directory = './'  # Destination directory for the archive and its unzipped files

# Extract the filename from the URL
file_name = repository_url.split('/')[-1]
archive_path = os.path.join(output_directory, file_name)

# The archive on disk is the marker for "already downloaded". It is written
# below after a successful extraction, so re-running this cell does not hit
# the network again.
download_required = not os.path.exists(archive_path)

if download_required:
    import requests
    import zipfile
    import io

    # Download the zip file; the timeout keeps a stalled connection from
    # hanging the notebook indefinitely
    response = requests.get(repository_url, timeout=60)
    if response.status_code == 200:
        # Create the output directory if it doesn't exist
        os.makedirs(output_directory, exist_ok=True)

        # Unzip the downloaded bytes to the output directory
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
            zip_ref.extractall(output_directory)

        # Persist the archive only after extraction succeeded, so a corrupt
        # download is never mistaken for a completed one on the next run
        with open(archive_path, 'wb') as archive_file:
            archive_file.write(response.content)

        print(f"Downloaded and unzipped '{file_name}' to '{output_directory}'")
    else:
        print(f"Failed to download '{file_name}' from GitHub. Status code: {response.status_code}")
else:
    print(f"File '{file_name}' already exists in '{output_directory}', no need to download.")
File 'Microsoft365DefenderEvents.Zip' already exists in './', no need to download.
Read Security Event Logs#
import json

file_path = "Microsoft365DefenderEvents.json"

# Read the whole file as UTF-8 text rather than relying on the platform's
# default encoding
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

# The file holds one JSON document per line. Split on "\n" only:
# str.splitlines() would also break on Unicode separators (e.g. U+2028) that
# may appear unescaped inside JSON strings. Blank lines are skipped so a
# trailing newline or empty row does not make json.loads fail.
json_data = [json.loads(line) for line in data.split('\n') if line.strip()]

json_data[7]
{'Timestamp': '2021-08-02T13:32:07Z',
'ActionType': 'MailItemsAccessed',
'Application': 'Microsoft Exchange Online',
'ApplicationId': 20893,
'AccountObjectId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
'AccountDisplayName': 'SimuLandApp',
'IsAdminOperation': 0,
'DeviceType': 'Other',
'OSPlatform': 'Unknown',
'IPAddress': '1.2.3.4',
'IsAnonymousProxy': 0,
'CountryCode': 'US',
'City': 'chicago',
'ISP': 'Microsoft 365 Common and Office Online server',
'UserAgent_dynamic': None,
'UserAgent_string': 'Client=REST;;',
'ActivityType': 'Run',
'ActivityObjects': [{'ServiceObjectType': 'Session ID',
'Type': 'Structured object',
'Role': 'Parameter'},
{'Type': 'Task', 'Role': 'Target object', 'Name': 'MailItemsAccessed'},
{'Type': 'Property',
'Role': 'Parameter',
'Name': 'MailAccessType',
'Value': 'Bind'},
{'Type': 'Property',
'Role': 'Parameter',
'Name': 'IsThrottled',
'Value': 'False'},
{'ApplicationInstance': 0,
'ApplicationId': 11161,
'Type': 'User',
'Role': 'Parameter',
'Name': 'Gustavo Pedro',
'Id': 'aead923d-498b-4f64-a66c-2af91447a8b6'},
{'ApplicationInstance': 0,
'ApplicationId': 11161,
'Type': 'Account',
'Role': 'Actor',
'Name': 'SimuLandApp',
'Id': '5a95e683-08ad-424e-a441-1d1aec52c02c'}],
'ObjectName': 'MailItemsAccessed',
'ObjectType': 'Task',
'ObjectId': '',
'ReportId': '106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44',
'AdditionalFields': {'IsSatelliteProvider': False},
'UserId': '',
'Permissions': None,
'PermissionsAddedTo': '',
'RawEventData': {'OrganizationId': '00000000-0000-0000-0000-000000000000',
'CreationTime': '2021-08-02T13:32:07.0000000Z',
'RecordType': 50,
'Operation': 'MailItemsAccessed',
'UserType': 0,
'Workload': 'Exchange',
'Version': 1,
'UserKey': '100320015858B802',
'UserId': 'pgustavo@simulandlabs.com',
'OriginatingServer': 'AB1CD23EF4567 (15.20.4200.000)\r\n',
'InternalLogonType': 0,
'OrganizationName': 'simulandlabs.onmicrosoft.com',
'ClientInfoString': 'Client=REST;;',
'MailboxOwnerSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
'ClientIPAddress': '1.2.3.4',
'MailboxOwnerUPN': 'pgustavo@simulandlabs.com',
'ExternalAccess': False,
'ResultStatus': 'Succeeded',
'Id': '699e0b10-1c53-403e-976f-ce0847a92b44',
'LogonUserSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
'MailboxGuid': 'd0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b',
'LogonType': 0,
'OperationProperties': ['@{Value=Bind; Name=MailAccessType}',
'@{Value=False; Name=IsThrottled}'],
'OperationCount': 7,
'AppId': '00000003-0000-0000-c000-000000000000',
'Folders': ['@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}'],
'ClientAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c'},
'spnID': '',
'rawData': {'OrganizationId': '00000000-0000-0000-0000-000000000000',
'CreationTime': '2021-08-02T13:32:07.0000000Z',
'RecordType': 50,
'Operation': 'MailItemsAccessed',
'UserType': 0,
'Workload': 'Exchange',
'Version': 1,
'UserKey': '100320015858B802',
'UserId': 'pgustavo@simulandlabs.com',
'OriginatingServer': 'AB1CD23EF4567 (15.20.4200.000)\r\n',
'InternalLogonType': 0,
'OrganizationName': 'simulandlabs.onmicrosoft.com',
'ClientInfoString': 'Client=REST;;',
'MailboxOwnerSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
'ClientIPAddress': '1.2.3.4',
'MailboxOwnerUPN': 'pgustavo@simulandlabs.com',
'ExternalAccess': False,
'ResultStatus': 'Succeeded',
'Id': '699e0b10-1c53-403e-976f-ce0847a92b44',
'LogonUserSid': 'S-1-5-21-1825954961-3338807533-2873504967-26087451',
'MailboxGuid': 'd0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b',
'LogonType': 0,
'OperationProperties': ['@{Value=Bind; Name=MailAccessType}',
'@{Value=False; Name=IsThrottled}'],
'OperationCount': 7,
'AppId': '00000003-0000-0000-c000-000000000000',
'Folders': ['@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}'],
'ClientAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c'},
'AppId': '00000003-0000-0000-c000-000000000000',
'OAuthAppId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
'TargetAccountUpn': '',
'TargetAccountDisplayName': '',
'TargetDeviceName': '',
'DestinationDeviceName': '',
'DestinationIPAddress': '',
'DestinationPort': None,
'Protocol': '',
'AccountName': '',
'AccountDomain': '',
'AccountUpn': '',
'AccountSid': '',
'DeviceName': '',
'Port': None,
'Location': ''}
Tokenize One Event Log#
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

# Serialize the event at index 7 back into a JSON string, then tokenize it
event_text = json.dumps(json_data[7])
tokens = tokenizer.tokenize(event_text)
Raw Example Log - Index 7#
{“Timestamp”:”2021-08-02T13:32:07Z”,”ActionType”:”MailItemsAccessed”,”Application”:”Microsoft Exchange Online”,”ApplicationId”:20893,”AccountObjectId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”,”AccountDisplayName”:”SimuLandApp”,”IsAdminOperation”:0,”DeviceType”:”Other”,”OSPlatform”:”Unknown”,”IPAddress”:”1.2.3.4”,”IsAnonymousProxy”:0,”CountryCode”:”US”,”City”:”chicago”,”ISP”:”Microsoft 365 Common and Office Online server”,”UserAgent_dynamic”:null,”UserAgent_string”:”Client=REST;;”,”ActivityType”:”Run”,”ActivityObjects”:[{“ServiceObjectType”:”Session ID”,”Type”:”Structured object”,”Role”:”Parameter”},{“Type”:”Task”,”Role”:”Target object”,”Name”:”MailItemsAccessed”},{“Type”:”Property”,”Role”:”Parameter”,”Name”:”MailAccessType”,”Value”:”Bind”},{“Type”:”Property”,”Role”:”Parameter”,”Name”:”IsThrottled”,”Value”:”False”},{“ApplicationInstance”:0,”ApplicationId”:11161,”Type”:”User”,”Role”:”Parameter”,”Name”:”Gustavo Pedro”,”Id”:”aead923d-498b-4f64-a66c-2af91447a8b6”},{“ApplicationInstance”:0,”ApplicationId”:11161,”Type”:”Account”,”Role”:”Actor”,”Name”:”SimuLandApp”,”Id”:”5a95e683-08ad-424e-a441-1d1aec52c02c”}],”ObjectName”:”MailItemsAccessed”,”ObjectType”:”Task”,”ObjectId”:””,”ReportId”:”106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44”,”AdditionalFields”:{“IsSatelliteProvider”:false},”UserId”:””,”Permissions”:null,”PermissionsAddedTo”:””,”RawEventData”:{”OrganizationId”:”00000000-0000-0000-0000-000000000000”,”CreationTime”:”2021-08-02T13:32:07.0000000Z”,”RecordType”:50,”Operation”:”MailItemsAccessed”,”UserType”:0,”Workload”:”Exchange”,”Version”:1,”UserKey”:”100320015858B802”,”UserId”:”pgustavo@simulandlabs.com”,”OriginatingServer”:”AB1CD23EF4567 
(15.20.4200.000)\r\n”,”InternalLogonType”:0,”OrganizationName”:”simulandlabs.onmicrosoft.com”,”ClientInfoString”:”Client=REST;;”,”MailboxOwnerSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”ClientIPAddress”:”1.2.3.4”,”MailboxOwnerUPN”:”pgustavo@simulandlabs.com”,”ExternalAccess”:false,”ResultStatus”:”Succeeded”,”Id”:”699e0b10-1c53-403e-976f-ce0847a92b44”,”LogonUserSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”MailboxGuid”:”d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b”,”LogonType”:0,”OperationProperties”:[“@{Value=Bind; Name=MailAccessType}”,”@{Value=False; Name=IsThrottled}”],”OperationCount”:7,”AppId”:”00000003-0000-0000-c000-000000000000”,”Folders”:[“@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\Inbox; FolderItems=System.Object[]}”],”ClientAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”},”spnID”:””,”rawData”:{”OrganizationId”:”00000000-0000-0000-0000-000000000000”,”CreationTime”:”2021-08-02T13:32:07.0000000Z”,”RecordType”:50,”Operation”:”MailItemsAccessed”,”UserType”:0,”Workload”:”Exchange”,”Version”:1,”UserKey”:”100320015858B802”,”UserId”:”pgustavo@simulandlabs.com”,”OriginatingServer”:”AB1CD23EF4567 (15.20.4200.000)\r\n”,”InternalLogonType”:0,”OrganizationName”:”simulandlabs.onmicrosoft.com”,”ClientInfoString”:”Client=REST;;”,”MailboxOwnerSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”ClientIPAddress”:”1.2.3.4”,”MailboxOwnerUPN”:”pgustavo@simulandlabs.com”,”ExternalAccess”:false,”ResultStatus”:”Succeeded”,”Id”:”699e0b10-1c53-403e-976f-ce0847a92b44”,”LogonUserSid”:”S-1-5-21-1825954961-3338807533-2873504967-26087451”,”MailboxGuid”:”d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b”,”LogonType”:0,”OperationProperties”:[“@{Value=Bind; Name=MailAccessType}”,”@{Value=False; Name=IsThrottled}”],”OperationCount”:7,”AppId”:”00000003-0000-0000-c000-000000000000”,”Folders”:[“@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\Inbox; 
FolderItems=System.Object[]}”],”ClientAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”},”AppId”:”00000003-0000-0000-c000-000000000000”,”OAuthAppId”:”5a95e683-08ad-424e-a441-1d1aec52c02c”,”TargetAccountUpn”:””,”TargetAccountDisplayName”:””,”TargetDeviceName”:””,”DestinationDeviceName”:””,”DestinationIPAddress”:””,”DestinationPort”:null,”Protocol”:””,”AccountName”:””,”AccountDomain”:””,”AccountUpn”:””,”AccountSid”:””,”DeviceName”:””,”Port”:null,”Location”:””}
# Peek at the first ten tokens produced for the serialized event
tokens[:10]
['▁{"', 'Tim', 'estamp', '":', '▁"', '2', '021', '-', '08', '-']
# Look at tokens 20 through 29 of the same event
tokens[20:30]
['Action', 'Type', '":', '▁"', 'Mail', 'Items', 'Acc', 'essed', '",', '▁"']
import torch

# Look up the vocabulary index of every event token
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Nesting the ID list inside an outer list gives the tensor its leading
# batch dimension (one sequence in the batch)
input_ids = torch.tensor([token_ids])
Generate Embeddings for One Event Log#
from transformers import BigBirdModel

# Load the pretrained weights with the checkpoint's default configuration
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")

# Run the model with gradient tracking disabled; only the outputs are needed
with torch.no_grad():
    outputs = model(input_ids)
embeddings = outputs.last_hidden_state
embeddings
tensor([[[ 0.1869, 0.0935, -0.0685, ..., -0.0211, -0.1009, -0.0862],
[ 0.0773, -0.1688, -0.1072, ..., 0.1626, 0.1574, 0.1664],
[ 0.1790, -0.0747, 0.0009, ..., 0.2164, 0.1874, 0.1451],
...,
[ 0.1302, -0.0763, -0.1211, ..., 0.0049, -0.3677, -0.0096],
[ 0.3301, -0.5417, -0.3299, ..., -0.2153, -0.4639, -0.1000],
[ 0.1222, -0.2770, -0.0157, ..., 0.0253, -0.0452, 0.0553]]])
# Number of embedding vectors produced for the single event in the batch
embeddings.shape[1]
1708