Download this Jupyter notebook for an interactive experience.

Python Environment

For SSH tunneling: ssh -i (private key file) -L 4367:localhost:4367 ec2-user@111.222.333.444

Start by importing the xgt module and creating a connection object.

import xgt
# Connection() is called with no arguments, so the library's defaults apply.
# NOTE(review): presumably that is the local endpoint forwarded by the SSH
# tunnel (port 4367) -- confirm against the xgt Connection documentation.
conn = xgt.Connection()

Are we running and connected via SSH tunneling?

# Bare expression: as the last statement of the cell, the notebook displays
# its value.
conn.server_version
'0.17.0'

Define vertex and Netflow edge schema

# Drop frames that may be left over from an earlier run.  The two edge frames
# are dropped before the 'Devices' vertex frame (presumably because edge
# frames reference it -- confirm against the xgt documentation).
conn.drop_frame('Netflow')
conn.drop_frame('Wls1v')
conn.drop_frame('Devices')

# Vertex frame with a single TEXT column, 'Device', which is also the key.
dev = conn.create_vertex_frame(
    name='Devices',
    schema=[['Device', xgt.TEXT]],
    key='Device')

# Edge frame for network-flow records.  Both endpoints live in 'Devices':
# the source is looked up through 'srcDevice', the target through 'dstDevice'.
nf = conn.create_edge_frame(
    name='Netflow',
    schema=[
        ['epochtime', xgt.INT],
        ['duration', xgt.INT],
        ['srcDevice', xgt.TEXT],
        ['dstDevice', xgt.TEXT],
        ['protocol', xgt.INT],
        ['srcPort', xgt.INT],
        ['dstPort', xgt.INT],
        ['srcPackets', xgt.INT],
        ['dstPackets', xgt.INT],
        ['srcBytes', xgt.INT],
        ['dstBytes', xgt.INT],
    ],
    source='Devices',
    target='Devices',
    source_key='srcDevice',
    target_key='dstDevice')

Another edge type schema

# Edge frame for the host-log records.  Source and target are both resolved
# in 'Devices' through the same 'logHost' column, so each edge's two
# endpoints come from one column value.
wls1v = conn.create_edge_frame(
    name='Wls1v',
    schema=[
        ['epochtime', xgt.INT],
        ['eventID', xgt.INT],
        ['logHost', xgt.TEXT],
        ['userName', xgt.TEXT],
        ['domainName', xgt.TEXT],
        ['logonID', xgt.INT],
        ['processName', xgt.TEXT],
        ['processID', xgt.INT],
        ['parentProcessName', xgt.TEXT],
        ['parentProcessID', xgt.INT],
    ],
    source='Devices',
    target='Devices',
    source_key='logHost',
    target_key='logHost')

Load server log data

import time

# Time the whole load, URL construction included.
start_time = time.time()

# One CSV per day; range(4, 5) selects day 4 only.  %02d zero-pads the day
# number in the file name.
urls = [
    "http://datasets.trovares.com/LANL/xgt/wls_day-%02d_1v.csv" % day
    for day in range(4, 5)
]
wls1v.load(urls)

end_time = time.time()
print("Wsl load time: %.3f seconds" % (end_time - start_time))
Wsl load time: 26.505 seconds

Load netflow data

import time

# Time the whole load, URL construction included.
start_time = time.time()

# One CSV per day; range(4, 5) selects day 4 only.  %02d zero-pads the day
# number in the file name.
urls = [
    "http://datasets.trovares.com/LANL/xgt/nf_day-%02d.csv" % day
    for day in range(4, 5)
]
nf.load(urls)

end_time = time.time()
print("Netflow load time: %.3f seconds" % (end_time - start_time))
Netflow load time: 254.100 seconds

How much data did we load?

# Read the current size of each frame from the frame objects.
num_devices = dev.num_vertices
num_netflow = nf.num_edges
num_wls_1v = wls1v.num_edges

# format(n, ',') renders the number with thousands separators.
print("Rows of Devices data: " + format(num_devices, ','))
print("Rows of Netflow data: " + format(num_netflow, ','))
print("Rows of Wls 1V data: " + format(num_wls_1v, ','))
Rows of Devices data: 157,949
Rows of Netflow data: 222,323,503
Rows of Wls 1V data: 16,402,438

Graph Pattern of interest

Zombie Reboot

Search for just a boot event

# Pattern: a Wls1v edge whose source and target are the same vertex (dev1),
# restricted to eventID 4608 -- the boot event this step searches for.
# The number of matches is written to the 'answers' frame.
q = """
MATCH (dev1)-[boot:Wls1v]->(dev1)
WHERE boot.eventID = 4608
RETURN COUNT(*)
INTO answers
"""

def run_query_count(query):
    """Run *query* on the server and return the count it stored.

    The 'answers' frame is dropped first, then the job is run and its
    wall-clock duration printed.  The return value is the first column of the
    first row of the 'answers' table frame, so *query* is expected to write a
    single count there (e.g. ``RETURN COUNT(*) INTO answers``).
    """
    conn.drop_frame('answers')

    began = time.time()
    conn.run_job(query)
    elapsed = time.time() - began
    print("Query time: %.3f seconds" % elapsed)

    # The count sits in the only cell of the result table.
    rows = conn.get_table_frame('answers').get_data()
    return rows[0][0]

# Run the query and report the tally with thousands separators.
count = run_query_count(q)
print('Number of boot events: ' + format(count, ','))
Query time: 0.329 seconds
Number of boot events: 1,712

Boot event followed by Program Start event

# Pattern: two consecutive Wls1v self-edges on the same device -- a boot
# record (eventID 4608) followed by a program-start record (eventID 4688)
# stamped no earlier than the boot and fewer than 4 epochtime units after it.
q = """
MATCH (dev1)-[boot:Wls1v]->(dev1)-[ps:Wls1v]->(dev1)
WHERE boot.eventID = 4608
  AND ps.eventID = 4688
  AND ps.epochtime >= boot.epochtime
  AND ps.epochtime - boot.epochtime < 4
RETURN COUNT(*)
INTO answers
"""

# Run the query and report the tally with thousands separators.
count = run_query_count(q)
print('Number of boot==>programstart events: ' + format(count, ','))
Query time: 0.921 seconds
Number of boot==>programstart events: 253,009

Boot, PS, and C2-connect

# Pattern: the boot (4608) and program-start (4688) self-edges on dev1, plus
# an edge nf0 arriving at dev1 from a different device dev2 with srcPort 3128.
# nf0 carries no frame label in the pattern; srcPort is a Netflow column, so
# presumably only Netflow edges can satisfy that condition.
q = """
MATCH (dev2)-[nf0]->(dev1)-[boot:Wls1v]->(dev1)-[ps:Wls1v]->(dev1)
WHERE dev1 <> dev2
  AND nf0.srcPort = 3128
  AND boot.eventID = 4608
  AND ps.eventID = 4688
  AND ps.epochtime >= boot.epochtime
  AND nf0.epochtime >= ps.epochtime
  AND nf0.epochtime - boot.epochtime <= 3
RETURN COUNT(*)
INTO answers
"""
# The events are ordered boot <= ps <= nf0, and the final condition bounds
# the whole three-event sequence (boot to nf0) to at most 3 epochtime units.

# Run the query and report the tally with thousands separators.
count = run_query_count(q)
print('Number of boot==>programstart==>nf0 events: ' + format(count, ','))
Query time: 4.164 seconds
Number of boot==>programstart==>nf0 events: 109

Zombie Reboot

# Full pattern: the boot / program-start / nf0 sequence on dev1 (nf0 arriving
# from dev2 with srcPort 3128, all within 3 epochtime units of the boot),
# plus a second edge nf1 from dev2 to a third device dev3.  The three devices
# must be pairwise distinct.  nf1 must have duration >= 3600, start before
# nf0, and still be running at nf0's timestamp
# (nf1.epochtime + nf1.duration >= nf0.epochtime).
q = """
MATCH (dev2)-[nf0]->(dev1)-[boot:Wls1v]->(dev1)-[ps:Wls1v]->(dev1),
      (dev2)-[nf1]->(dev3)
WHERE dev1 <> dev2 AND dev2 <> dev3 AND dev1 <> dev3
  AND nf0.srcPort = 3128
  AND boot.eventID = 4608
  AND ps.eventID = 4688
  AND ps.epochtime >= boot.epochtime
  AND nf0.epochtime >= ps.epochtime
  AND nf0.epochtime - boot.epochtime <= 3
  AND nf1.duration >= 3600
  AND nf1.epochtime < nf0.epochtime
  AND nf1.epochtime + nf1.duration >= nf0.epochtime
RETURN COUNT(*)
INTO answers
"""

# Run the query and report the tally with thousands separators.
count = run_query_count(q)
print('Number of Zombie Reboot events: ' + format(count, ','))
Query time: 16.352 seconds
Number of Zombie Reboot events: 981

Discussion