forked from CoachCoe/NuHarborHackathon
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep5_concat_UUID_Summary_to_All-CS_All-MS.py
87 lines (68 loc) · 2.73 KB
/
step5_concat_UUID_Summary_to_All-CS_All-MS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import json
import pandas as pd
import re
# Key workbook and alert exports. Forward slashes are used for every path so
# the script resolves the same files on Windows, Linux and macOS (a backslash
# is only a path separator on Windows), consistent with output_prefixes.
filepath = 'hackathon-solution/Key.xlsx'
json_files = [
    'hackathon-solution/All-MS/All-MS.json',
    'hackathon-solution/All-CS/All-CS.json',
]
# Folder prefix for each entry of json_files, in the same order; the output
# file name is appended to the prefix by process_json_file.
output_prefixes = [
    'hackathon-solution/All-MS/',
    'hackathon-solution/All-CS/',
]

# Read the Excel file. add_uuid reads its 'Microsoft', 'CrowdStrike', 'UUID'
# and 'Summary' columns.
df = pd.read_excel(filepath)
def remove_irrelevant_items(data):
    """Delete the keys listed in ``irrelevant_keys`` from ``data``, in place.

    Dicts and lists are walked recursively, so a listed key is removed at any
    nesting depth. Key matching is exact (case-sensitive). Values that are
    neither a dict nor a list are left untouched.

    Returns the same object that was passed in.
    """
    irrelevant_keys = [
        'UUID', 'GPT Summary', 'AlertID', 'AlertTimestamp', 'Affected_Device',
        'Affected_User', 'event_id', 'event_timestamp', 'event_time',
        'endpoint_id', 'host', 'Hostname', 'IP', 'OS', 'User',
        'destination_port', 'destination_ip',
    ]

    if isinstance(data, list):
        for element in data:
            remove_irrelevant_items(element)
        return data

    if not isinstance(data, dict):
        # Scalars (str, int, None, ...) have nothing to strip.
        return data

    # Drop the unwanted keys at this level first, then descend into whatever
    # values remain.
    for name in irrelevant_keys:
        data.pop(name, None)
    for child in data.values():
        remove_irrelevant_items(child)
    return data
def clean_value(value):
    """Return ``str(value)`` with every character outside 0-9, a-z, A-Z removed."""
    text = str(value)
    # Each run of non-alphanumeric characters is replaced with nothing.
    stripped = re.sub('[^0-9a-zA-Z]+', '', text)
    return stripped
def add_uuid(obj, df):
    """Tag ``obj`` with the UUID and Summary of the first key row matching it.

    ``obj`` is serialized with ``json.dumps`` and normalized with
    ``clean_value`` plus lower-casing. Rows of ``df`` are scanned in order; for
    each row the 'Microsoft' and then the 'CrowdStrike' cell is normalized the
    same way and tested as a substring of the normalized object. On the first
    hit, ``obj["UUID"]`` and ``obj["Summary"]`` are set from that row and the
    scan stops. If nothing matches, ``obj`` is left unchanged.

    Args:
        obj: JSON-serializable dict, modified in place.
        df: DataFrame with 'Microsoft', 'CrowdStrike', 'UUID' and 'Summary'
            columns.

    Returns:
        None.
    """
    # The serialized alert does not change while scanning the key table, so
    # normalize it once instead of once per row and column.
    haystack = clean_value(json.dumps(obj)).lower()

    for _, row in df.iterrows():
        for column in ['Microsoft', 'CrowdStrike']:
            current_value = row[column]
            if current_value is None or pd.isnull(current_value):
                continue

            needle = clean_value(current_value).lower()
            # An empty string is a substring of every string; without this
            # guard a cell holding only punctuation or whitespace would match
            # every alert and assign it the wrong UUID.
            if not needle:
                continue

            if needle in haystack:
                obj["UUID"] = row['UUID']
                obj["Summary"] = row['Summary']
                return
def process_json_file(json_file, output_prefix):
    """Enrich a JSON-lines file and write it to ``<output_prefix>UUID_Summary.json``.

    Every non-blank line of ``json_file`` is parsed as one JSON object, passed
    through ``remove_irrelevant_items``, tagged by ``add_uuid`` using the
    module-level ``df``, and written back out as one JSON object per line.

    Args:
        json_file: Path of the input file, one JSON document per line.
        output_prefix: String prepended to 'UUID_Summary.json' to form the
            output path.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    output_file = output_prefix + 'UUID_Summary.json'

    # Read the JSON file line by line and process each JSON object.
    # JSON text is read as UTF-8 rather than the platform default encoding.
    all_data = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at end of file) hold
            # no record and would make json.loads raise; skip them.
            if not line:
                continue
            data = json.loads(line)
            # Remove irrelevant keys
            data = remove_irrelevant_items(data)
            # Add UUID and Summary
            add_uuid(data, df)
            # Append the updated JSON object to the all_data list
            all_data.append(data)

    # Save the dataset to a new file only after the whole input parsed
    # cleanly, so a malformed input does not leave a partial output behind.
    with open(output_file, 'w', encoding='utf-8') as f:
        for data in all_data:
            json.dump(data, f)
            f.write('\n')
# Run the enrichment for every configured input, pairing each file with the
# output prefix at the same position.
jobs = zip(json_files, output_prefixes)
for json_file, output_prefix in jobs:
    process_json_file(json_file, output_prefix)