Create ATT&CK Groups Source knowledge#


Import Modules#

from attackcti import attack_client
import os
import logging

# Raise the 'taxii2client' logger threshold to CRITICAL so that lower-severity
# records from that logger are suppressed (presumably to keep the notebook
# output clean while the ATT&CK client queries the TAXII server -- confirm).
logging.getLogger('taxii2client').setLevel(logging.CRITICAL)

Define Initial Variables#

# Locations of the artifacts read and written by this notebook.
# NOTE: "__file__" is a string literal here, not the module attribute, so
# dirname() returns "" and every path below is relative to the current
# working directory.
current_directory = os.path.dirname("__file__")
(
    documents_directory,
    contrib_directory,
    embeddings_directory,
    templates_directory,
) = (
    os.path.join(current_directory, folder)
    for folder in ("documents", "contrib", "embeddings", "templates")
)
# Jinja2 template used to render one Markdown page per group.
group_template = os.path.join(templates_directory, "group.md")

Initialize ATT&CK Client#

# Instantiate the attackcti client; `lift` is the handle used below to query
# ATT&CK data.
lift = attack_client()

Get ATT&CK Groups#

Getting the technique STIX objects used by all groups across all ATT&CK matrices.

# Ask the client for the techniques used by all groups. Judging by the sample
# record displayed below, each entry combines a group (intrusion-set) object
# with the fields of one technique and of the relationship linking the two.
techniques_used_by_groups = lift.get_techniques_used_by_all_groups()
# Display the first record to inspect the available fields.
techniques_used_by_groups[0]
{'type': 'intrusion-set',
 'id': 'intrusion-set--b7f627e2-0817-4cd5-8d50-e75f8aa85cc6',
 'created_by_ref': 'identity--c78cb6e5-0c4b-4611-8297-d1b8b55e40b5',
 'created': '2023-02-23T15:31:38.829Z',
 'modified': '2023-04-17T21:49:16.371Z',
 'name': 'LuminousMoth',
 'description': '[LuminousMoth](https://attack.mitre.org/groups/G1014) is a Chinese-speaking cyber espionage group that has been active since at least October 2020. [LuminousMoth](https://attack.mitre.org/groups/G1014) has targeted high-profile organizations, including government entities, in Myanmar, the Philippines, Thailand, and other parts of Southeast Asia. Some security researchers have concluded there is a connection between [LuminousMoth](https://attack.mitre.org/groups/G1014) and [Mustang Panda](https://attack.mitre.org/groups/G0129) based on similar targeting and TTPs, as well as network infrastructure overlaps.(Citation: Kaspersky LuminousMoth July 2021)(Citation: Bitdefender LuminousMoth July 2021)',
 'aliases': ['LuminousMoth'],
 'external_references': [{'source_name': 'mitre-attack',
   'url': 'https://attack.mitre.org/groups/G1014',
   'external_id': 'G1014'},
  {'source_name': 'Bitdefender LuminousMoth July 2021',
   'description': 'Botezatu, B and etl. (2021, July 21). LuminousMoth - PlugX, File Exfiltration and Persistence Revisited. Retrieved October 20, 2022.',
   'url': 'https://www.bitdefender.com/blog/labs/luminousmoth-plugx-file-exfiltration-and-persistence-revisited'},
  {'source_name': 'Kaspersky LuminousMoth July 2021',
   'description': 'Lechtik, M, and etl. (2021, July 14). LuminousMoth APT: Sweeping attacks for the chosen few. Retrieved October 20, 2022.',
   'url': 'https://securelist.com/apt-luminousmoth/103332/'}],
 'object_marking_refs': ['marking-definition--fa42a846-8d90-4e51-bc29-71d5b4802168'],
 'x_mitre_attack_spec_version': '3.1.0',
 'x_mitre_contributors': ['Kyaw Pyiyt Htet, @KyawPyiytHtet',
  'Zaw Min Htun, @Z3TAE'],
 'x_mitre_deprecated': False,
 'x_mitre_domains': ['enterprise-attack'],
 'x_mitre_modified_by_ref': 'identity--c78cb6e5-0c4b-4611-8297-d1b8b55e40b5',
 'x_mitre_version': '1.0',
 'technique_ref': 'attack-pattern--32901740-b42c-4fdd-bc02-345b5dc57082',
 'relationship_description': '[LuminousMoth](https://attack.mitre.org/groups/G1014) has signed their malware with a valid digital signature.(Citation: Kaspersky LuminousMoth July 2021)',
 'relationship_id': 'relationship--eb4ce173-6f0e-4c12-9ff8-09c4fb1ae2d3',
 'revoked': False,
 'technique': 'Code Signing',
 'technique_description': 'Adversaries may create, acquire, or steal code signing materials to sign their malware or tools. Code signing provides a level of authenticity on a binary from the developer and a guarantee that the binary has not been tampered with. (Citation: Wikipedia Code Signing) The certificates used during an operation may be created, acquired, or stolen by the adversary. (Citation: Securelist Digital Certificates) (Citation: Symantec Digital Certificates) Unlike [Invalid Code Signature](https://attack.mitre.org/techniques/T1036/001), this activity will result in a valid signature.\n\nCode signing to verify software on first run can be used on modern Windows and macOS systems. It is not used on Linux due to the decentralized nature of the platform. (Citation: Wikipedia Code Signing)(Citation: EclecticLightChecksonEXECodeSigning)\n\nCode signing certificates may be used to bypass security policies that require signed code to execute on a system. ',
 'tactic': [KillChainPhase(kill_chain_name='mitre-attack', phase_name='defense-evasion')],
 'technique_id': 'T1553.002',
 'matrix': 'mitre-attack',
 'platform': ['macOS', 'Windows'],
 'data_sources': ['File: File Metadata']}

Create ATT&CK Groups Documents#

import copy
from jinja2 import Template

# Create Group docs
# The input is a flat list with one record per (group, technique) pair. Fold
# it into one entry per group id that holds the group metadata plus the list
# of techniques the group uses.
all_groups = {}
for technique in techniques_used_by_groups:
    group_key = technique['id']
    if group_key not in all_groups:
        # First record seen for this group: capture its metadata once.
        group = {
            'group_name': technique['name'],
            'group_id': technique['external_references'][0]['external_id'],
            'created': technique['created'],
            'modified': technique['modified'],
            'description': technique['description'],
            'aliases': technique['aliases'],
        }
        # Contributors are optional on the source record.
        if 'x_mitre_contributors' in technique:
            group['contributors'] = technique['x_mitre_contributors']
        group['techniques'] = []
        all_groups[group_key] = group

    # Technique-specific part of the record, attached to its group.
    technique_used = {
        'matrix': technique['matrix'],
        'domain': technique['x_mitre_domains'],
        'platform': technique['platform'],
        'tactics': technique['tactic'],
        'technique_id': technique['technique_id'],
        'technique_name': technique['technique'],
        'use': technique['relationship_description'],
    }
    # Data sources are optional on the source record.
    if 'data_sources' in technique:
        technique_used['data_sources'] = technique['data_sources']
    all_groups[group_key]['techniques'].append(technique_used)

# Make sure the output directory exists before any file is written.
if not os.path.exists(documents_directory):
    print("[+] Creating knowledge directory..")
    os.makedirs(documents_directory)

print("[+] Creating markadown files for each group..")
# Read the Jinja2 template through a context manager so the handle is closed,
# and decode it as UTF-8 to match the encoding used for the rendered files.
with open(group_template, encoding='utf-8') as template_file:
    markdown_template = Template(template_file.read())

# Render one Markdown file per group, named after the group.
for key in list(all_groups.keys()):
    group = all_groups[key]
    print("  [>>] Creating markdown file for {}..".format(group['group_name']))
    # Render from a deep copy so the template cannot alter the collected data.
    group_for_render = copy.deepcopy(group)
    markdown = markdown_template.render(
        metadata=group_for_render,
        group_name=group['group_name'],
        group_id=group['group_id'],
    )
    # Only spaces are replaced; other characters of the name are kept as-is.
    file_name = (group['group_name']).replace(' ', '_')
    # The context manager closes (and flushes) each file as soon as it is
    # written instead of leaving that to garbage collection.
    with open(f'{documents_directory}/{file_name}.md', encoding='utf-8', mode='w') as markdown_file:
        markdown_file.write(markdown)
[+] Creating markadown files for each group..
  [>>] Creating markdown file for LuminousMoth..
  [>>] Creating markdown file for Metador..
  [>>] Creating markdown file for CURIUM..
  [>>] Creating markdown file for EXOTIC LILY..
  [>>] Creating markdown file for Moses Staff..
  [>>] Creating markdown file for SideCopy..
  [>>] Creating markdown file for Aoqin Dragon..
  [>>] Creating markdown file for Earth Lusca..
  [>>] Creating markdown file for POLONIUM..
  [>>] Creating markdown file for LAPSUS$..
  [>>] Creating markdown file for Ember Bear..
  [>>] Creating markdown file for BITTER..
  [>>] Creating markdown file for Aquatic Panda..
  [>>] Creating markdown file for Confucius..
  [>>] Creating markdown file for LazyScripter..
  [>>] Creating markdown file for TeamTNT..
  [>>] Creating markdown file for Andariel..
  [>>] Creating markdown file for Ferocious Kitten..
  [>>] Creating markdown file for IndigoZebra..
  [>>] Creating markdown file for BackdoorDiplomacy..
  [>>] Creating markdown file for Transparent Tribe..
  [>>] Creating markdown file for Nomadic Octopus..
  [>>] Creating markdown file for Tonto Team..
  [>>] Creating markdown file for Ajax Security Team..
  [>>] Creating markdown file for Mustang Panda..
  [>>] Creating markdown file for ZIRCONIUM..
  [>>] Creating markdown file for TA551..
  [>>] Creating markdown file for Higaisa..
  [>>] Creating markdown file for HAFNIUM..
  [>>] Creating markdown file for Windigo..
  [>>] Creating markdown file for Volatile Cedar..
  [>>] Creating markdown file for Silent Librarian..
  [>>] Creating markdown file for Sidewinder..
  [>>] Creating markdown file for Evilnum..
  [>>] Creating markdown file for Indrik Spider..
  [>>] Creating markdown file for Fox Kitten..
  [>>] Creating markdown file for GOLD SOUTHFIELD..
  [>>] Creating markdown file for Chimera..
  [>>] Creating markdown file for Windshift..
  [>>] Creating markdown file for Blue Mockingbird..
  [>>] Creating markdown file for Whitefly..
  [>>] Creating markdown file for Rocke..
  [>>] Creating markdown file for DarkVishnya..
  [>>] Creating markdown file for Mofang..
  [>>] Creating markdown file for Wizard Spider..
  [>>] Creating markdown file for Inception..
  [>>] Creating markdown file for APT-C-36..
  [>>] Creating markdown file for BlackTech..
  [>>] Creating markdown file for APT41..
  [>>] Creating markdown file for Machete..
  [>>] Creating markdown file for Kimsuky..
  [>>] Creating markdown file for GALLIUM..
  [>>] Creating markdown file for TA505..
  [>>] Creating markdown file for Silence..
  [>>] Creating markdown file for WIRTE..
  [>>] Creating markdown file for The White Company..
  [>>] Creating markdown file for TEMP.Veles..
  [>>] Creating markdown file for APT39..
  [>>] Creating markdown file for FIN4..
  [>>] Creating markdown file for Gallmaker..
  [>>] Creating markdown file for SilverTerrier..
  [>>] Creating markdown file for APT38..
  [>>] Creating markdown file for Tropic Trooper..
  [>>] Creating markdown file for HEXANE..
  [>>] Creating markdown file for DarkHydrus..
  [>>] Creating markdown file for APT19..
  [>>] Creating markdown file for Cobalt Group..
  [>>] Creating markdown file for Orangeworm..
  [>>] Creating markdown file for Thrip..
  [>>] Creating markdown file for Gorgon Group..
  [>>] Creating markdown file for Rancor..
  [>>] Creating markdown file for Dark Caracal..
  [>>] Creating markdown file for Leafminer..
  [>>] Creating markdown file for Leviathan..
  [>>] Creating markdown file for Elderwood..
  [>>] Creating markdown file for BlackOasis..
  [>>] Creating markdown file for FIN8..
  [>>] Creating markdown file for APT33..
  [>>] Creating markdown file for PLATINUM..
  [>>] Creating markdown file for TA459..
  [>>] Creating markdown file for APT37..
  [>>] Creating markdown file for MuddyWater..
  [>>] Creating markdown file for Magic Hound..
  [>>] Creating markdown file for FIN5..
  [>>] Creating markdown file for PROMETHIUM..
  [>>] Creating markdown file for CopyKittens..
  [>>] Creating markdown file for BRONZE BUTLER..
  [>>] Creating markdown file for Sowbug..
  [>>] Creating markdown file for APT32..
  [>>] Creating markdown file for FIN10..
  [>>] Creating markdown file for OilRig..
  [>>] Creating markdown file for RTM..
  [>>] Creating markdown file for Gamaredon Group..
  [>>] Creating markdown file for FIN7..
  [>>] Creating markdown file for menuPass..
  [>>] Creating markdown file for Winnti Group..
  [>>] Creating markdown file for Group5..
  [>>] Creating markdown file for Strider..
  [>>] Creating markdown file for Patchwork..
  [>>] Creating markdown file for Suckfly..
  [>>] Creating markdown file for Stealth Falcon..
  [>>] Creating markdown file for FIN6..
  [>>] Creating markdown file for GCMAN..
  [>>] Creating markdown file for Dragonfly..
  [>>] Creating markdown file for Sandworm Team..
  [>>] Creating markdown file for Poseidon Group..
  [>>] Creating markdown file for Lazarus Group..
  [>>] Creating markdown file for Scarlet Mimic..
  [>>] Creating markdown file for Threat Group-1314..
  [>>] Creating markdown file for Threat Group-3390..
  [>>] Creating markdown file for APT18..
  [>>] Creating markdown file for APT17..
  [>>] Creating markdown file for Putter Panda..
  [>>] Creating markdown file for APT16..
  [>>] Creating markdown file for APT3..
  [>>] Creating markdown file for Molerats..
  [>>] Creating markdown file for Equation..
  [>>] Creating markdown file for Naikon..
  [>>] Creating markdown file for admin@338..
  [>>] Creating markdown file for APT29..
  [>>] Creating markdown file for APT30..
  [>>] Creating markdown file for Darkhotel..
  [>>] Creating markdown file for PittyTiger..
  [>>] Creating markdown file for Turla..
  [>>] Creating markdown file for Deep Panda..
  [>>] Creating markdown file for Carbanak..
  [>>] Creating markdown file for APT28..
  [>>] Creating markdown file for APT1..
  [>>] Creating markdown file for APT12..
  [>>] Creating markdown file for Ke3chang..
  [>>] Creating markdown file for Cleaver..
  [>>] Creating markdown file for Moafee..
  [>>] Creating markdown file for Axiom..
  [>>] Creating markdown file for ALLANITE..

Index Source Knowledge#

Load Documents#

import glob
from langchain.document_loaders import UnstructuredMarkdownLoader
# Every Markdown file found in the documents directory.
group_files = glob.glob(os.path.join(documents_directory, "*.md"))

# Parse each file with UnstructuredMarkdownLoader and accumulate the
# resulting documents in a single list.
md_docs = []
print("[+] Loading Group markdown files..")
for group in group_files:
    file_label = os.path.basename(group)
    print(f' [*] Loading {file_label}')
    md_docs += UnstructuredMarkdownLoader(group).load()

print(f'[+] Number of .md documents processed: {len(md_docs)}')
[+] Loading Group markdown files..
 [*] Loading admin@338.md
 [*] Loading Ajax_Security_Team.md
 [*] Loading ALLANITE.md
 [*] Loading Andariel.md
 [*] Loading Aoqin_Dragon.md
 [*] Loading APT-C-36.md
 [*] Loading APT1.md
 [*] Loading APT12.md
 [*] Loading APT16.md
 [*] Loading APT17.md
 [*] Loading APT18.md
 [*] Loading APT19.md
 [*] Loading APT28.md
 [*] Loading APT29.md
 [*] Loading APT3.md
 [*] Loading APT30.md
 [*] Loading APT32.md
 [*] Loading APT33.md
 [*] Loading APT37.md
 [*] Loading APT38.md
 [*] Loading APT39.md
 [*] Loading APT41.md
 [*] Loading Aquatic_Panda.md
 [*] Loading Axiom.md
 [*] Loading BackdoorDiplomacy.md
 [*] Loading BITTER.md
 [*] Loading BlackOasis.md
 [*] Loading BlackTech.md
 [*] Loading Blue_Mockingbird.md
 [*] Loading BRONZE_BUTLER.md
 [*] Loading Carbanak.md
 [*] Loading Chimera.md
 [*] Loading Cleaver.md
 [*] Loading Cobalt_Group.md
 [*] Loading Confucius.md
 [*] Loading CopyKittens.md
 [*] Loading CURIUM.md
 [*] Loading Darkhotel.md
 [*] Loading DarkHydrus.md
 [*] Loading DarkVishnya.md
 [*] Loading Dark_Caracal.md
 [*] Loading Deep_Panda.md
 [*] Loading Dragonfly.md
 [*] Loading Earth_Lusca.md
 [*] Loading Elderwood.md
 [*] Loading Ember_Bear.md
 [*] Loading Equation.md
 [*] Loading Evilnum.md
 [*] Loading EXOTIC_LILY.md
 [*] Loading Ferocious_Kitten.md
 [*] Loading FIN10.md
 [*] Loading FIN4.md
 [*] Loading FIN5.md
 [*] Loading FIN6.md
 [*] Loading FIN7.md
 [*] Loading FIN8.md
 [*] Loading Fox_Kitten.md
 [*] Loading GALLIUM.md
 [*] Loading Gallmaker.md
 [*] Loading Gamaredon_Group.md
 [*] Loading GCMAN.md
 [*] Loading GOLD_SOUTHFIELD.md
 [*] Loading Gorgon_Group.md
 [*] Loading Group5.md
 [*] Loading HAFNIUM.md
 [*] Loading HEXANE.md
 [*] Loading Higaisa.md
 [*] Loading Inception.md
 [*] Loading IndigoZebra.md
 [*] Loading Indrik_Spider.md
 [*] Loading Ke3chang.md
 [*] Loading Kimsuky.md
 [*] Loading LAPSUS$.md
 [*] Loading Lazarus_Group.md
 [*] Loading LazyScripter.md
 [*] Loading Leafminer.md
 [*] Loading Leviathan.md
 [*] Loading LuminousMoth.md
 [*] Loading Machete.md
 [*] Loading Magic_Hound.md
 [*] Loading menuPass.md
 [*] Loading Metador.md
 [*] Loading Moafee.md
 [*] Loading Mofang.md
 [*] Loading Molerats.md
 [*] Loading Moses_Staff.md
 [*] Loading MuddyWater.md
 [*] Loading Mustang_Panda.md
 [*] Loading Naikon.md
 [*] Loading Nomadic_Octopus.md
 [*] Loading OilRig.md
 [*] Loading Orangeworm.md
 [*] Loading Patchwork.md
 [*] Loading PittyTiger.md
 [*] Loading PLATINUM.md
 [*] Loading POLONIUM.md
 [*] Loading Poseidon_Group.md
 [*] Loading PROMETHIUM.md
 [*] Loading Putter_Panda.md
 [*] Loading Rancor.md
 [*] Loading Rocke.md
 [*] Loading RTM.md
 [*] Loading Sandworm_Team.md
 [*] Loading Scarlet_Mimic.md
 [*] Loading SideCopy.md
 [*] Loading Sidewinder.md
 [*] Loading Silence.md
 [*] Loading Silent_Librarian.md
 [*] Loading SilverTerrier.md
 [*] Loading Sowbug.md
 [*] Loading Stealth_Falcon.md
 [*] Loading Strider.md
 [*] Loading Suckfly.md
 [*] Loading TA459.md
 [*] Loading TA505.md
 [*] Loading TA551.md
 [*] Loading TeamTNT.md
 [*] Loading TEMP.Veles.md
 [*] Loading The_White_Company.md
 [*] Loading Threat_Group-1314.md
 [*] Loading Threat_Group-3390.md
 [*] Loading Thrip.md
 [*] Loading Tonto_Team.md
 [*] Loading Transparent_Tribe.md
 [*] Loading Tropic_Trooper.md
 [*] Loading Turla.md
 [*] Loading Volatile_Cedar.md
 [*] Loading Whitefly.md
 [*] Loading Windigo.md
 [*] Loading Windshift.md
 [*] Loading Winnti_Group.md
 [*] Loading WIRTE.md
 [*] Loading Wizard_Spider.md
 [*] Loading ZIRCONIUM.md
[+] Number of .md documents processed: 134

Check the page content of one loaded document

# Print the text extracted from the first loaded document as a sanity check.
print(md_docs[0].page_content)
admin@338 - G0018

Created: 2017-05-31T21:31:53.579Z

Modified: 2020-03-18T19:54:59.120Z

Contributors: Tatsuya Daitoku, Cyber Defense Institute, Inc.

Aliases

admin@338

Description

admin@338 is a China-based cyber threat group. It has previously used newsworthy events as lures to deliver malware and has primarily targeted organizations involved in financial, economic, and trade policy, typically using publicly available RATs such as PoisonIvy, as well as some non-public backdoors. (Citation: FireEye admin@338)

Techniques Used

admin@338 has sent emails with malicious Microsoft Office documents attached.(Citation: FireEye admin@338)|
|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1204.002|Malicious File|

admin@338 has attempted to get victims to launch malicious Microsoft Word attachments delivered via spearphishing emails.(Citation: FireEye admin@338)|
|mitre-attack|enterprise-attack|Linux,Windows,macOS|T1203|Exploitation for Client Execution|

admin@338 has exploited client software vulnerabilities for execution, such as Microsoft Word CVE-2012-0158.(Citation: FireEye admin@338)|
|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1087.001|Local Account|

admin@338 actors used the following commands following exploitation of a machine with

LOWBALL malware to enumerate user accounts:

admin@338 actors used the following command to rename one of their tools to a benign file name:

admin@338 actors used the following command following exploitation of a machine with

LOWBALL malware to list local groups:

admin@338 actors used the following commands after exploiting a machine with

LOWBALL malware to obtain information about the OS:

admin@338 actors used the following command after exploiting a machine with

LOWBALL malware to acquire information about local networks:

admin@338 actors used the following command following exploitation of a machine with

LOWBALL malware to obtain information about services:

admin@338 actors used the following command following exploitation of a machine with

LOWBALL malware to display network connections:

admin@338 actors used the following commands after exploiting a machine with

LOWBALL malware to obtain information about files and directories:

LOWBALL malware,

admin@338 actors created a file containing a list of commands to be executed on the compromised computer.(Citation: FireEye admin@338)|

Split Documents#

Check token counts on loaded documents

import tiktoken

# Encoding that tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Encode the first document; disallowed_special=() disables the
# special-token check.
token_integers = tokenizer.encode(
    md_docs[0].page_content,
    disallowed_special=(),
)
num_tokens = len(token_integers)
# Raw bytes behind each token id, to show how the text was split.
token_bytes = list(map(tokenizer.decode_single_token_bytes, token_integers))

print(f"token count: {num_tokens} tokens")
print(f"token integers: {token_integers}")
print(f"token bytes: {token_bytes}")
token count: 532 tokens
token integers: [2953, 31, 18633, 482, 480, 4119, 23, 271, 11956, 25, 220, 679, 22, 12, 2304, 12, 2148, 51, 1691, 25, 2148, 25, 4331, 13, 24847, 57, 271, 19696, 25, 220, 2366, 15, 12, 2839, 12, 972, 51, 777, 25, 4370, 25, 2946, 13, 4364, 57, 271, 54084, 9663, 25, 350, 1900, 45644, 423, 1339, 16900, 11, 34711, 16777, 10181, 11, 4953, 382, 96309, 271, 2953, 31, 18633, 271, 5116, 271, 2953, 31, 18633, 374, 264, 5734, 6108, 21516, 6023, 1912, 13, 1102, 706, 8767, 1511, 502, 2332, 34594, 4455, 439, 326, 1439, 311, 6493, 40831, 323, 706, 15871, 17550, 11351, 6532, 304, 6020, 11, 7100, 11, 323, 6696, 4947, 11, 11383, 1701, 17880, 2561, 98980, 82, 1778, 439, 52212, 40, 14029, 11, 439, 1664, 439, 1063, 2536, 57571, 1203, 28404, 13, 320, 34, 7709, 25, 6785, 51158, 4074, 31, 18633, 696, 29356, 8467, 12477, 271, 2953, 31, 18633, 706, 3288, 14633, 449, 39270, 5210, 8410, 9477, 12673, 13127, 34, 7709, 25, 6785, 51158, 4074, 31, 18633, 8, 7511, 91, 1800, 265, 12, 21208, 91, 79034, 12, 21208, 91, 47424, 11, 12214, 3204, 11, 13466, 91, 51, 4364, 19, 13, 6726, 91, 30700, 9824, 2958, 44838, 2953, 31, 18633, 706, 17644, 311, 636, 12697, 311, 7195, 39270, 5210, 9506, 34779, 12886, 4669, 41963, 764, 11218, 14633, 13127, 34, 7709, 25, 6785, 51158, 4074, 31, 18633, 8, 7511, 91, 1800, 265, 12, 21208, 91, 79034, 12, 21208, 91, 47424, 11, 13466, 11, 12214, 3204, 91, 51, 4364, 18, 91, 8193, 385, 7709, 369, 8589, 32028, 44838, 2953, 31, 18633, 706, 51763, 3016, 3241, 52227, 369, 11572, 11, 1778, 439, 5210, 9506, 46869, 12, 679, 17, 12, 16037, 23, 13127, 34, 7709, 25, 6785, 51158, 4074, 31, 18633, 8, 7511, 91, 1800, 265, 12, 21208, 91, 79034, 12, 21208, 91, 47424, 11, 12214, 3204, 11, 13466, 91, 51, 6640, 22, 13, 4119, 91, 7469, 8785, 44838, 2953, 31, 18633, 20142, 1511, 279, 2768, 11545, 2768, 40761, 315, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 13555, 1217, 9815, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 3290, 311, 30174, 832, 315, 872, 7526, 311, 264, 65309, 1052, 836, 1473, 2953, 
31, 18633, 20142, 1511, 279, 2768, 3290, 2768, 40761, 315, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 1160, 2254, 5315, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 11545, 1306, 71701, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 6994, 2038, 922, 279, 10293, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 3290, 1306, 71701, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 21953, 2038, 922, 2254, 14488, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 3290, 2768, 40761, 315, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 6994, 2038, 922, 3600, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 3290, 2768, 40761, 315, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 3113, 4009, 13537, 1473, 2953, 31, 18633, 20142, 1511, 279, 2768, 11545, 1306, 71701, 264, 5780, 449, 271, 9628, 79580, 40831, 311, 6994, 2038, 922, 3626, 323, 29725, 1473, 9628, 79580, 40831, 3638, 2953, 31, 18633, 20142, 3549, 264, 1052, 8649, 264, 1160, 315, 11545, 311, 387, 16070, 389, 279, 44500, 6500, 13127, 34, 7709, 25, 6785, 51158, 4074, 31, 18633, 18419]
token bytes: [b'admin', b'@', b'338', b' -', b' G', b'001', b'8', b'\n\n', b'Created', b':', b' ', b'201', b'7', b'-', b'05', b'-', b'31', b'T', b'21', b':', b'31', b':', b'53', b'.', b'579', b'Z', b'\n\n', b'Modified', b':', b' ', b'202', b'0', b'-', b'03', b'-', b'18', b'T', b'19', b':', b'54', b':', b'59', b'.', b'120', b'Z', b'\n\n', b'Contrib', b'utors', b':', b' T', b'ats', b'uya', b' D', b'ait', b'oku', b',', b' Cyber', b' Defense', b' Institute', b',', b' Inc', b'.\n\n', b'Aliases', b'\n\n', b'admin', b'@', b'338', b'\n\n', b'Description', b'\n\n', b'admin', b'@', b'338', b' is', b' a', b' China', b'-based', b' cyber', b' threat', b' group', b'.', b' It', b' has', b' previously', b' used', b' new', b'sw', b'orthy', b' events', b' as', b' l', b'ures', b' to', b' deliver', b' malware', b' and', b' has', b' primarily', b' targeted', b' organizations', b' involved', b' in', b' financial', b',', b' economic', b',', b' and', b' trade', b' policy', b',', b' typically', b' using', b' publicly', b' available', b' RAT', b's', b' such', b' as', b' Poison', b'I', b'vy', b',', b' as', b' well', b' as', b' some', b' non', b'-public', b' back', b'doors', b'.', b' (', b'C', b'itation', b':', b' Fire', b'Eye', b' admin', b'@', b'338', b')\n\n', b'Techn', b'iques', b' Used', b'\n\n', b'admin', b'@', b'338', b' has', b' sent', b' emails', b' with', b' malicious', b' Microsoft', b' Office', b' documents', b' attached', b'.(', b'C', b'itation', b':', b' Fire', b'Eye', b' admin', b'@', b'338', b')', b'|\n', b'|', b'mit', b're', b'-', b'attack', b'|', b'enterprise', b'-', b'attack', b'|', b'Linux', b',', b'mac', b'OS', b',', b'Windows', b'|', b'T', b'120', b'4', b'.', b'002', b'|', b'Mal', b'icious', b' File', b'|\n\n', b'admin', b'@', b'338', b' has', b' attempted', b' to', b' get', b' victims', b' to', b' launch', b' malicious', b' Microsoft', b' Word', b' attachments', b' delivered', b' via', b' spear', b'ph', b'ishing', b' emails', b'.(', b'C', b'itation', b':', b' Fire', 
b'Eye', b' admin', b'@', b'338', b')', b'|\n', b'|', b'mit', b're', b'-', b'attack', b'|', b'enterprise', b'-', b'attack', b'|', b'Linux', b',', b'Windows', b',', b'mac', b'OS', b'|', b'T', b'120', b'3', b'|', b'Exp', b'lo', b'itation', b' for', b' Client', b' Execution', b'|\n\n', b'admin', b'@', b'338', b' has', b' exploited', b' client', b' software', b' vulnerabilities', b' for', b' execution', b',', b' such', b' as', b' Microsoft', b' Word', b' CVE', b'-', b'201', b'2', b'-', b'015', b'8', b'.(', b'C', b'itation', b':', b' Fire', b'Eye', b' admin', b'@', b'338', b')', b'|\n', b'|', b'mit', b're', b'-', b'attack', b'|', b'enterprise', b'-', b'attack', b'|', b'Linux', b',', b'mac', b'OS', b',', b'Windows', b'|', b'T', b'108', b'7', b'.', b'001', b'|', b'Local', b' Account', b'|\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' commands', b' following', b' exploitation', b' of', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' enumerate', b' user', b' accounts', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' command', b' to', b' rename', b' one', b' of', b' their', b' tools', b' to', b' a', b' benign', b' file', b' name', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' command', b' following', b' exploitation', b' of', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' list', b' local', b' groups', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' commands', b' after', b' exploiting', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' obtain', b' information', b' about', b' the', b' OS', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' command', b' after', b' exploiting', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' acquire', b' information', b' about', b' local', b' networks', 
b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' command', b' following', b' exploitation', b' of', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' obtain', b' information', b' about', b' services', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' command', b' following', b' exploitation', b' of', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' display', b' network', b' connections', b':\n\n', b'admin', b'@', b'338', b' actors', b' used', b' the', b' following', b' commands', b' after', b' exploiting', b' a', b' machine', b' with', b'\n\n', b'LOW', b'BALL', b' malware', b' to', b' obtain', b' information', b' about', b' files', b' and', b' directories', b':\n\n', b'LOW', b'BALL', b' malware', b',\n\n', b'admin', b'@', b'338', b' actors', b' created', b' a', b' file', b' containing', b' a', b' list', b' of', b' commands', b' to', b' be', b' executed', b' on', b' the', b' compromised', b' computer', b'.(', b'C', b'itation', b':', b' Fire', b'Eye', b' admin', b'@', b'338', b')|']

Create a length function to calculate the min, max, and avg token count across all documents

def tiktoken_len(text):
    """Return the number of tokens in *text*.

    The text is encoded with the module-level ``tokenizer``. The
    special-token check is disabled by passing ``disallowed_special=()``.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Token count of every loaded document, in document order.
token_counts = list(map(tiktoken_len, (doc.page_content for doc in md_docs)))

# Summary statistics; the average is truncated to an integer.
smallest = min(token_counts)
average = int(sum(token_counts) / len(token_counts))
largest = max(token_counts)
print(f"""[+] Token Counts:
Min: {smallest}
Avg: {average}
Max: {largest}""")
[+] Token Counts:
Min: 176
Avg: 1619
Max: 7346

Use langchain text splitter

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking Text
print('[+] Initializing RecursiveCharacterTextSplitter..')
# Lengths are measured with tiktoken_len, so the sizes below are expressed
# in tokens rather than characters.
splitter_options = {
    'chunk_size': 500,
    'chunk_overlap': 50,  # number of tokens overlap between chunks
    'length_function': tiktoken_len,
    'separators': ['\n\n', '\n', ' ', ''],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
[+] Initializing RecursiveCharacterTextSplitter..
print('[+] Splitting documents in chunks..')
# Split every loaded document with the splitter configured above.
chunks = text_splitter.split_documents(md_docs)

# Report how many chunks were produced from how many documents.
print(f'[+] Number of documents: {len(md_docs)}')
print(f'[+] Number of chunks: {len(chunks)}')
[+] Splitting documents in chunks..
[+] Number of documents: 134
[+] Number of chunks: 534
# Show the first two chunks to inspect their content and metadata.
print(chunks[0])
print(chunks[1])
page_content='admin@338 - G0018\n\nCreated: 2017-05-31T21:31:53.579Z\n\nModified: 2020-03-18T19:54:59.120Z\n\nContributors: Tatsuya Daitoku, Cyber Defense Institute, Inc.\n\nAliases\n\nadmin@338\n\nDescription\n\nadmin@338 is a China-based cyber threat group. It has previously used newsworthy events as lures to deliver malware and has primarily targeted organizations involved in financial, economic, and trade policy, typically using publicly available RATs such as PoisonIvy, as well as some non-public backdoors. (Citation: FireEye admin@338)\n\nTechniques Used\n\nadmin@338 has sent emails with malicious Microsoft Office documents attached.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1204.002|Malicious File|\n\nadmin@338 has attempted to get victims to launch malicious Microsoft Word attachments delivered via spearphishing emails.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,Windows,macOS|T1203|Exploitation for Client Execution|\n\nadmin@338 has exploited client software vulnerabilities for execution, such as Microsoft Word CVE-2012-0158.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1087.001|Local Account|\n\nadmin@338 actors used the following commands following exploitation of a machine with\n\nLOWBALL malware to enumerate user accounts:\n\nadmin@338 actors used the following command to rename one of their tools to a benign file name:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to list local groups:\n\nadmin@338 actors used the following commands after exploiting a machine with\n\nLOWBALL malware to obtain information about the OS:\n\nadmin@338 actors used the following command after exploiting a machine with\n\nLOWBALL malware to acquire information about local networks:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to obtain information 
about services:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to display network connections:\n\nadmin@338 actors used the following commands after exploiting a machine with' metadata={'source': 'documents\\admin@338.md'}
page_content='LOWBALL malware to obtain information about services:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to display network connections:\n\nadmin@338 actors used the following commands after exploiting a machine with\n\nLOWBALL malware to obtain information about files and directories:\n\nLOWBALL malware,\n\nadmin@338 actors created a file containing a list of commands to be executed on the compromised computer.(Citation: FireEye admin@338)|' metadata={'source': 'documents\\admin@338.md'}

Contribute Split Documents (Optional)#

We can contribute this so that others can use the data generated so far.

import hashlib

# Build one JSON-serializable record per text chunk so the split documents
# can be shared with others.
json_documents = []
for doc in md_docs:
    doc_name = os.path.basename(doc.metadata['source'])
    # Hash each file name with a fresh MD5 object. A single hash object that
    # is update()d on every iteration keeps accumulating input, so every ID
    # after the first would depend on all previously processed names (and on
    # iteration order) instead of identifying this document alone.
    uid = hashlib.md5(doc_name.encode('utf-8')).hexdigest()[:12]
    chunks_strings = text_splitter.split_text(doc.page_content)
    # Record ID is "<document hash prefix>-<chunk index>".
    json_documents.extend(
        {'id': f'{uid}-{i}', 'text': chunk, 'source': doc_name}
        for i, chunk in enumerate(chunks_strings)
    )
print(json_documents[0])
print(json_documents[1])
{'id': '4d1ab63e9fd8-0', 'text': 'admin@338 - G0018\n\nCreated: 2017-05-31T21:31:53.579Z\n\nModified: 2020-03-18T19:54:59.120Z\n\nContributors: Tatsuya Daitoku, Cyber Defense Institute, Inc.\n\nAliases\n\nadmin@338\n\nDescription\n\nadmin@338 is a China-based cyber threat group. It has previously used newsworthy events as lures to deliver malware and has primarily targeted organizations involved in financial, economic, and trade policy, typically using publicly available RATs such as PoisonIvy, as well as some non-public backdoors. (Citation: FireEye admin@338)\n\nTechniques Used\n\nadmin@338 has sent emails with malicious Microsoft Office documents attached.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1204.002|Malicious File|\n\nadmin@338 has attempted to get victims to launch malicious Microsoft Word attachments delivered via spearphishing emails.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,Windows,macOS|T1203|Exploitation for Client Execution|\n\nadmin@338 has exploited client software vulnerabilities for execution, such as Microsoft Word CVE-2012-0158.(Citation: FireEye admin@338)|\n|mitre-attack|enterprise-attack|Linux,macOS,Windows|T1087.001|Local Account|\n\nadmin@338 actors used the following commands following exploitation of a machine with\n\nLOWBALL malware to enumerate user accounts:\n\nadmin@338 actors used the following command to rename one of their tools to a benign file name:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to list local groups:\n\nadmin@338 actors used the following commands after exploiting a machine with\n\nLOWBALL malware to obtain information about the OS:\n\nadmin@338 actors used the following command after exploiting a machine with\n\nLOWBALL malware to acquire information about local networks:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to 
obtain information about services:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to display network connections:\n\nadmin@338 actors used the following commands after exploiting a machine with', 'source': 'admin@338.md'}
{'id': '4d1ab63e9fd8-1', 'text': 'LOWBALL malware to obtain information about services:\n\nadmin@338 actors used the following command following exploitation of a machine with\n\nLOWBALL malware to display network connections:\n\nadmin@338 actors used the following commands after exploiting a machine with\n\nLOWBALL malware to obtain information about files and directories:\n\nLOWBALL malware,\n\nadmin@338 actors created a file containing a list of commands to be executed on the compromised computer.(Citation: FireEye admin@338)|', 'source': 'admin@338.md'}

Export Knowledge Base as JSONL File (Optional)

import json

# Export the chunk records as JSON Lines: one JSON object per line.
print('[+] Exporting groups as .jsonl file..')
# Create the target folder first so open() does not fail when it is missing.
os.makedirs(contrib_directory, exist_ok=True)
jsonl_path = os.path.join(contrib_directory, "attack-groups.jsonl")
# Explicit UTF-8 so the file does not depend on the platform default encoding.
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for doc in json_documents:
        f.write(json.dumps(doc) + '\n')
[+] Exporting groups as .jsonl file..

Generate Embeddings#

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
# Build the open-source sentence-transformer embedding function.
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# Load the chunks into a Chroma collection and save it to disk.
db = Chroma.from_documents(
    chunks,
    embedding_function,
    collection_name="groups_collection",
    persist_directory="./chroma_db",
)
C:\Users\RobertoRodriguez\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Ask a question directly to the DB

# Question to run against the vector store.
query = "What threat actors send text messages to their targets?"

# Run a similarity search for the question.
relevant_docs = db.similarity_search(query)

# Print the text of the first returned document.
top_match = relevant_docs[0]
print(top_match.page_content)
Lazarus Group has created new Twitter accounts to conduct social engineering against potential victims.(Citation: Google TAG Lazarus Jan 2021)|
|mitre-attack|enterprise-attack,ics-attack|Linux,macOS,Windows|T1566.003|Spearphishing via Service|

Lazarus Group has used social media platforms, including LinkedIn and Twitter, to send spearphishing messages.(Citation: Google TAG Lazarus Jan 2021)|
|mitre-attack|enterprise-attack,ics-attack|PRE|T1584.004|Server|

Lazarus Group has compromised servers to stage malicious tools.(Citation: Kaspersky ThreatNeedle Feb 2021)|
|mitre-attack|enterprise-attack,ics-attack|PRE|T1591|Gather Victim Org Information|

Lazarus Group has studied publicly available information about a targeted organization to tailor spearphishing efforts against specific departments and/or individuals.(Citation: Kaspersky ThreatNeedle Feb 2021)|
|mitre-attack|enterprise-attack,ics-attack|PRE|T1585.002|Email Accounts|

Lazarus Group has created new email accounts for spearphishing operations.(Citation: Kaspersky ThreatNeedle Feb 2021)|
|mitre-attack|enterprise-attack,ics-attack|PRE|T1588.002|Tool|

Lazarus Group has obtained a variety of tools for their operations, including

Responder and PuTTy PSCP.(Citation: Kaspersky ThreatNeedle Feb 2021)|
|mitre-attack|enterprise-attack,ics-attack|PRE|T1589.002|Email Addresses|

Lazarus Group collected email addresses belonging to various departments of a targeted organization which were used in follow-on phishing campaigns.(Citation: Kaspersky ThreatNeedle Feb 2021)|
|mitre-attack|enterprise-attack,ics-attack|Windows|T1218.011|Rundll32|

Lazarus Group has used rundll32 to execute malicious payloads on a compromised host.(Citation: ESET Twitter Ida Pro Nov 2021)|
|mitre-attack|enterprise-attack,ics-attack|macOS,Windows|T1553.002|Code Signing|