5G_slice_mobility_security/NRves2csv.py at main · gtri/5G_slice_mobility_security · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
'''
Process the raw JSON from the VES collector into a
dataframe and save to a csv. That csv can be loaded
from the NRDataset.py file to make the torch dataset.
'''
# TODO: Add a data-alignment function to combine VES measurements with Amarisoft UE data

import sys
import time
import pandas as pd
import json
from pathlib import Path
import numpy as np
from sklearn import preprocessing
import argparse
from functools import reduce
from datetime import datetime
import pytz

def get_ves_labels(ves_events):
    '''
    Collect the feature labels found in the first 1000 VES events.

    The measurements for a single timestamp come in a few chunks, so the labels
    have to be gathered over a sufficiently large set of items in the JSON object
    pulled from the message router rather than from a single event.

    ves_events: sliceable sequence of decoded VES events
    returns: list of unique labels in first-seen order. The first entry is always
             'timestamp'; every other entry is '<objectName> <keyValue>' where
             keyValue is taken from the first objectKey of an object instance.
    '''
    # A dict is used as an insertion-ordered set: the same label shows up in
    # almost every event and only needs to be reported once.
    # Timestamp is the index of each row, so it always comes first.
    seen = {'timestamp': None}
    for event in ves_events[:1000]:
        for additional_object in event['event']['measurementsForVfScalingFields']['additionalObjects']:
            # List of all top level metrics
            for instance in additional_object['objectInstances']:
                label = additional_object['objectName'] + ' ' + instance['objectKeys'][0]['keyValue']
                seen.setdefault(label, None)
    return list(seen)


def standardize_timecodes(timestamp: str):
    '''
    Convert a collector timestamp string into a UTC epoch timestamp.

    The string is split on whitespace and read by position: field 1 is the month
    number, field 2 the day, field 3 the year and field 4 the time of day as
    'HH:MM:SS'. Field 0 and anything after field 4 are ignored.

    The hour is used exactly as written. No AM/PM correction is applied, so a
    source that reports a 12-hour clock will be off by 12 hours after noon.

    timestamp: a string such as 'Tue, 09 24 2024 03:15:30 UTC'
    returns: float seconds since the Unix epoch, reading the fields as UTC
    '''
    # Function-scope import: the standard library's UTC object is all that is
    # needed here, so no third-party timezone lookup is required.
    from datetime import timezone

    fields = timestamp.split()
    month = int(fields[1])
    day = int(fields[2])
    year = int(fields[3])
    # Split the time of day once instead of once per component
    clock = fields[4].split(':')
    hour = int(clock[0])
    minute = int(clock[1])
    second = int(clock[2])

    moment = datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)
    return moment.timestamp()


def ves2csv(path: Path, mal_id: int = 0):
    '''
    Process a JSON file of VES events into a dataframe.

    Each event contributes one row keyed by its collector timestamp; rows that
    share a timestamp are then summed into a single sample. Nothing is written
    to disk here.

    path: JSON file holding the list of VES events
    mal_id: The benign/malicious label for this set of samples (0: benign, 1: malicious)
    returns: dataframe indexed by 'timestamp' with one column per feature label
             plus a 'label' column set to mal_id
    '''
    # Load data from json
    data = json.loads(path.read_text(encoding="UTF-8"))
    # Seed the columns with the labels discovered at the start of the capture
    data_dict = {key: [] for key in get_ves_labels(data)}
    timestamps = data_dict['timestamp']

    # Loop through collected events
    for event in data:
        # Process the timecode into a UTC timestamp
        collector_timestamp = event['event']['commonEventHeader']['internalHeaderFields']['collectorTimeStamp']
        timestamps.append(standardize_timecodes(collector_timestamp))
        n_rows = len(timestamps)
        for objectInstance in event['event']['measurementsForVfScalingFields']['additionalObjects']:
            # List of all top level metrics
            for object_ in objectInstance['objectInstances']:
                feature_label = objectInstance['objectName'] + ' ' + object_['objectKeys'][0]['keyValue']
                column = data_dict.get(feature_label)
                if column is None:
                    # Label first appears after the events scanned for labels:
                    # create its column and back-fill the rows already seen.
                    column = data_dict[feature_label] = ['NaN'] * (n_rows - 1)
                # NOTE(review): every value of the instance is appended, so an
                # instance carrying more than one value would make this column
                # longer than the others - confirm each instance holds one value.
                column.extend(object_['objectInstance'].values())
        # Pad every feature this event did not report so all columns stay aligned
        for column in data_dict.values():
            if len(column) < n_rows:
                column.append('NaN')

    # Process the data to consolidate samples from the same time into a single entry
    df = pd.DataFrame.from_dict(data_dict)
    df = df.replace('NaN', None)
    df = df.infer_objects(copy=False).fillna(0)
    df = df.groupby(['timestamp']).sum()
    # Add benign/malicious labels (scalar broadcast also works with no feature columns)
    df['label'] = mal_id
    return df


def combine_csv(frames: list, saveFile: str = 'data/NR_dataset.csv'):
    '''
    Stack a list of dataframes row-wise, write the result to csv and return it.

    frames: dataframes to concatenate; their existing indices are kept
    saveFile: destination of the csv file
    returns: the concatenated dataframe
    '''
    stacked = pd.concat(frames)
    stacked.to_csv(saveFile)
    return stacked


def split_slices(df, slice_labels, target_slice_sd):
    '''
    Turn a full-core dataset into a slice-specific dataset.

    ves2csv produces one sample per VES event and every event carries data for
    all slices on the core network. For each slice this function keeps only the
    columns whose name contains one of that slice's identifiers, so each input
    sample becomes one output sample per slice. The per-slice frames are then
    stacked vertically.

    df: dataframe with string column names and a 'label' column
    slice_labels: List of tuples. Each tuple should contain the snssai of a slice and corresponding upf ip addr
    Ex: [('1-111111', '10.1.0.138:9090')]
    target_slice_sd: string matched as a substring against the first element of
                     each tuple; only matching slices keep the labels from df,
                     every other slice gets label 0
    returns: stacked dataframe with the identifier stripped from the column
             names plus 'label' and 'slice_id' columns
    '''
    per_slice = []
    for slice_ids in slice_labels:
        # Columns that mention any identifier of this slice
        selected = [name for name in df.columns if any(ident in name for ident in slice_ids)]
        subset = df.loc[:, selected]

        # Malicious labels stay on the targeted slice only
        if target_slice_sd in slice_ids[0]:
            slice_label_values = df['label']
        else:
            slice_label_values = df.loc[:, 'label'] * 0
        subset.loc[:, 'label'] = slice_label_values

        # Strip the slice id from column names (keep the first word only)
        renames = {}
        for name in selected:
            if any(ident in name for ident in list(sum(slice_labels, ()))):
                renames[name] = name.split()[0]
            else:
                renames[name] = name
        subset = subset.rename(columns=renames)

        # Keep track of which slice each row belongs to
        subset['slice_id'] = slice_ids[0]
        per_slice.append(subset)

    # Vertically stack the component dataframes
    return pd.concat(per_slice, axis=0)


def modify_labels(df, tup, tdown, sim_duration=1800,
                  save_file='slice-dataset-mitigated-ban-mod-labels.csv'):
    '''
    Relabel the "down" part of each attack cycle as benign.

    Technically the attack is in process while the UEs swing off of the target slice, but I want
    to test the detector when load is being applied to the target slice separately. The detector
    is really struggling to identify an attack on individual slices currently and I think this
    might be the reason. The attack signature doesn't show up on individual slices when they
    aren't under load.

    Starting at the detected attack start, every cycle of tup + tdown seconds has
    the samples whose timestamp lies in [cycle_start, cycle_start + tdown) set to
    label 0, until sim_duration seconds have been covered. df is modified in place.

    df: dataframe with 'timestamp' and 'label' columns. The start search looks up
        the row labelled i + 20, so a default integer index is assumed.
    tup, tdown: cycle durations in seconds; tup + tdown must be positive
    sim_duration: total time span to relabel, in seconds
    save_file: csv path the result is written to; pass None to skip writing
    returns: the same dataframe object, relabelled
    raises: ValueError if tup + tdown is not positive or no attack start is found
    '''
    if tup + tdown <= 0:
        # The cycle loop below would never advance
        raise ValueError('tup + tdown must be positive')

    timestamps = df['timestamp']

    # Set appropriate start value - some of the data files have a few random samples at the start
    # that were in the buffer before collection began. We need to skip those samples.
    # A malicious sample is accepted as the start when the sample 20 rows later is
    # less than 105 seconds after it.
    start_time = None
    for i in df.index[df['label'] == 1]:
        if i + 20 not in df.index:
            # Too close to the end of the data to check the spacing
            continue
        candidate = int(timestamps.loc[i])
        time_diff_error = int(timestamps.loc[i + 20]) - (candidate + 100)
        if time_diff_error < 5:
            start_time = candidate
            break
    if start_time is None:
        raise ValueError('no malicious sample with regular spacing found to start from')

    # Iterate through the cycles to set the labels to correspond with the attack timing
    time_elapsed = 0
    while sim_duration > time_elapsed:
        # Boolean mask keeps the selection aligned with the index labels
        in_window = (timestamps >= start_time) & (timestamps < start_time + tdown)
        df.loc[in_window, 'label'] = 0
        time_elapsed = time_elapsed + tup + tdown
        start_time = start_time + tup + tdown

    if save_file is not None:
        df.to_csv(save_file)
    return df


def make_ue_df(ue_sample_dir: Path, data_filter: list = None, mal_ue_list: list = None, malicious: bool = False):
    '''
    Build one dataframe from the per-UE csv samples.

    UE samples are collected across separate directories: ue_sample_dir holds one
    folder per UE and each folder holds that UE's csv files. Folder names are
    expected to look like '<prefix>-<imsi>'. Within a UE, files with identical
    columns are stacked and the resulting groups are outer-merged on 'epoch'
    (rounded to the nearest second). Folders and files are visited in sorted
    order so the output is reproducible. UE folders with no (matching) files and
    entries that are not directories are skipped.

    ue_sample_dir: The main directory containing all the UE folders
    data_filter: A list of strings; only files whose path contains one of them are read
    mal_ue_list: list of malicious UE IMSIs for labeling
    malicious: Bool value used to set the UEs in mal_ue_list to have a label of 1
    returns: dataframe with 'timestamp' (renamed from 'epoch'), the merged stats,
             'imsi' and 'label' columns; missing values are filled with 0
    '''
    mal_ue_list = [] if mal_ue_list is None else mal_ue_list
    ue_frames = []

    # Iterate through all UE folders
    for pth in sorted(ue_sample_dir.iterdir()):
        if not pth.is_dir():
            continue
        imsi = pth.stem.split('-')[1]

        # Read each data file of this UE once, filtered if specified
        files = sorted(pth.iterdir())
        if data_filter:
            files = [p for p in files if any(x in str(p) for x in data_filter)]
        if not files:
            continue

        # Group frames that share the same columns so they can be stacked
        concat_matching_dfs = {}
        for p in files:
            df = pd.read_csv(p)
            # Round epochs to nearest second for consistency between UE files
            df.epoch = df.epoch.round()
            concat_matching_dfs.setdefault(tuple(df.columns), []).append(df)

        df_list = [pd.concat(frames, axis=0, ignore_index=True) for frames in concat_matching_dfs.values()]
        # Fold the per-schema frames into one by outer-merging on the epoch
        df_output = reduce(lambda left, right: pd.merge(left, right, on=['epoch'], how='outer'), df_list).fillna(0)
        df_output['imsi'] = imsi
        ue_frames.append(df_output)

    # Stack data for each UE
    total_df = pd.concat(ue_frames, axis=0, ignore_index=True) if ue_frames else pd.DataFrame()

    # Add label and rename epoch to timestamp
    total_df['label'] = 0
    total_df = total_df.rename(columns={"epoch": "timestamp"}).fillna(0)
    # Set the malicious UEs to label 1 (IMSIs are compared as strings)
    if malicious and 'imsi' in total_df.columns:
        mal_imsis = [str(imsi) for imsi in mal_ue_list]
        total_df.loc[total_df['imsi'].isin(mal_imsis), 'label'] = 1
    return total_df


if __name__ == '__main__':
    # Command-line entry point: builds the core, slice and UE csv datasets.
    # All six positional arguments are required and are read from the parsed
    # argparse namespace, so the parser is the single source of truth for them.
    parser = argparse.ArgumentParser(description='Process raw ves messages and UE data samples into csv datasets.')
    parser.add_argument('ves_files_list', metavar='Benign File', type=Path,
                        help='A json file containing all the data files and keys indicating if the data is malicious and what slice is the target')
    parser.add_argument('benign_ue_dir', metavar='Benign UE dir.', type=Path,
                        help='Directory where benign UE data is.')
    parser.add_argument('malicious_ue_dir', metavar='Malicious UE dir.', type=Path,
                        help='Directory where malicious UE data is.')
    parser.add_argument('save_file_core', metavar='Save file for core data.', type=Path,
                        help='Output save file.')
    parser.add_argument('save_file_slice', metavar='Save file for slice data.', type=Path,
                        help='Output save file.')
    parser.add_argument('save_file_ue', metavar='Save file for ue data.', type=Path,
                        help='Output save file.')
    args = parser.parse_args()

    # Inputs
    core_manifest_path = args.ves_files_list
    benign_ue_dir = args.benign_ue_dir
    malicious_ue_dir = args.malicious_ue_dir
    save_file_core = args.save_file_core
    save_file_slice = args.save_file_slice
    save_file_ue = args.save_file_ue

    # Core Dataframes
    # The manifest maps each VES json path to an object whose first value is the
    # benign/malicious id and whose second value is the targeted slice.
    core_data = json.loads(core_manifest_path.read_text(encoding="UTF-8"))
    ves_files = list(core_data.keys())
    mal_ids = [list(targets.values())[0] for targets in core_data.values()]
    target_slice_sds = [list(targets.values())[1] for targets in core_data.values()]
    core_frames = [ves2csv(Path(path), mal_id) for path, mal_id in zip(ves_files, mal_ids)]
    # Combined core data
    core_df = combine_csv(core_frames, save_file_core)

    # Slice Dataframe
    # (snssai, upf address) pairs of the slices present in the captures
    slice_labels = [('1-111111','10.1.0.138:9090'), ('1-222222','10.1.0.201:9090'), ('2-333333','10.1.0.228:9090'), ('2-444444','10.1.0.33:9090'), ('3-555555','10.1.0.76:9090'), ('3-666666','10.1.0.232:9090')]
    slice_frames = [split_slices(core_frame, slice_labels, target_slice_sd) for core_frame, target_slice_sd in zip(core_frames, target_slice_sds)]
    slice_df = combine_csv(slice_frames, save_file_slice)

    # UE Dataframe
    # Only UE files whose path contains one of these strings are loaded
    benign_data_filter = ['09-24']
    benign_ue_df = make_ue_df(benign_ue_dir, benign_data_filter)
    malicious_data_filter = ['09-24']
    mal_ue_imsis = [999700000000001, 999700000000003, 999700000000011, 999700000000012, 999700000000013, 999700000000015, 999700000000016, 999700000000017, 999700000000019, 999700000000024, 999700000000025,
    999700000000027, 999700000000032, 999700000000033, 999700000000035, 999700000000038, 999700000000041, 999700000000043, 999700000000045, 999700000000047, 999700000000050, 999700000000054, 999700000000055,
    999700000000069, 999700000000070, 999700000000071, 999700000000079, 999700000000081, 999700000000093, 999700000000111, 999700000000117, 999700000000118, 999700000000122, 999700000000123, 999700000000126,
    999700000000128, 999700000000131, 999700000000144, 999700000000146, 999700000000148, 999700000000158, 999700000000159, 999700000000170, 999700000000179]
    malicious_ue_df = make_ue_df(malicious_ue_dir, malicious_data_filter, mal_ue_list = mal_ue_imsis, malicious = True)

    full_ue_df = combine_csv([benign_ue_df, malicious_ue_df], saveFile=save_file_ue)