diff --git a/demyst_anonymize.py b/demyst_anonymize.py
new file mode 100644
index 00000000..2adde428
--- /dev/null
+++ b/demyst_anonymize.py
@@ -0,0 +1,44 @@
+import dask.dataframe as dd
+import random
+import string
+import os
+
+# Import the module that generates the large test dataset
+import generate_large_file
+
+# Anonymize a specific column of a pandas partition by replacing each value
+# with a random string; intended for use with df.map_partitions
+def anonymize_column(partition, column_name):
+    # Check that the column holds strings; raise an error otherwise
+    if partition[column_name].dtype != 'object':
+        raise ValueError(f"Column '{column_name}' is not of type string; cannot anonymize.")
+    # Pandas Series.apply takes no `meta` keyword; that argument belongs to Dask
+    partition[column_name] = partition[column_name].apply(anonymize)
+    return partition
+
+# Helper function to generate random strings
+def anonymize(x):
+    # Replace the value with 10 characters drawn at random from A-Z
+    return ''.join(random.choices(string.ascii_uppercase, k=10))
+
+def apply_to_data():
+    # Resolve the current working directory for the output path
+    current_directory = os.getcwd()
+
+    # Read the large CSV file using Dask
+    df = dd.read_csv('generated_large_data.csv')
+
+    # Anonymize specific columns; `meta` tells Dask the name and dtype of the result
+    df['first_name'] = df['first_name'].apply(anonymize, meta=('x', 'str'))
+    df['last_name'] = df['last_name'].apply(anonymize, meta=('x', 'str'))
+    df['address'] = df['address'].apply(anonymize, meta=('x', 'str'))
+
+    # Write the anonymized DataFrame back to a single new CSV file;
+    # to_csv triggers the computation, so no separate compute() call is needed
+    output_file_path = os.path.join(current_directory, 'anonymized_file.csv')
+    df.to_csv(output_file_path, single_file=True)
+
+if __name__ == '__main__':
+    # Generate the input data first, then anonymize it
+    generate_large_file.generate_file()
+    apply_to_data()
\ No newline at end of file
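Note: anonymize_column above is defined but never called; apply_to_data anonymizes each column directly with Dask's Series.apply. As a minimal sketch, the same work could be pushed through the partition-based helper with map_partitions (column names taken from generate_large_file.py; the output filename here is illustrative):

import dask.dataframe as dd
from demyst_anonymize import anonymize_column

# Each map_partitions call hands anonymize_column one pandas partition,
# plus the extra positional argument (the column name)
df = dd.read_csv('generated_large_data.csv')
for column in ['first_name', 'last_name', 'address']:
    df = df.map_partitions(anonymize_column, column)
df.to_csv('anonymized_file.csv', single_file=True)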
diff --git a/demyst_fixed_width.py b/demyst_fixed_width.py
new file mode 100644
index 00000000..82ace4e0
--- /dev/null
+++ b/demyst_fixed_width.py
@@ -0,0 +1,65 @@
+import json
+
+# Open, parse, and close the spec.json file; json.load is safer than eval
+with open('spec.json', 'r') as spec_file:
+    spec_dict = json.load(spec_file)
+
+# Create a variable for each entry in the spec.json file
+column_names = spec_dict["ColumnNames"]
+# Use map to cast each value of 'Offsets' from string to int
+offsets = list(map(int, spec_dict["Offsets"]))
+fixed_width_encoding = spec_dict["FixedWidthEncoding"]
+include_header = spec_dict["IncludeHeader"] == "True"  # The spec stores this flag as a string
+delimited_encoding = spec_dict["DelimitedEncoding"]
+
+# Parse the fixed-width file and write its fields to a CSV file
+def parse_fixed_width_file(input_file_path, output_file_path):
+
+    # Open, read, and close the input file
+    with open(input_file_path, 'r', encoding=fixed_width_encoding) as input_file:
+        lines = input_file.readlines()
+
+    # Open and close the output file for writing
+    with open(output_file_path, 'w', encoding=delimited_encoding) as output_file:
+
+        # Write the header if specified
+        if include_header:
+            # Join the column names with commas per the desired CSV format
+            output_file.write(','.join(column_names) + '\n')
+
+        # Parse each line based on the column offsets
+        for line in lines:
+            line = line.rstrip('\n')  # Remove any trailing newline
+            row = []
+            current_pos = 0
+            for offset in offsets:
+                # Slice the input line from the current position to the current position plus the offset
+                value = line[current_pos:current_pos + offset].strip()
+
+                # Append the extracted value to the row of data
+                row.append(value)
+
+                # Move the current position to the start of the next column
+                current_pos += offset
+
+            # Write the row to the CSV file, joining the fields with commas
+            output_file.write(','.join(row) + '\n')
+
+if __name__ == '__main__':
+    # Standard test
+    parse_fixed_width_file('input1.txt', 'output1.csv')
+
+    # Values larger than the column width test
+    parse_fixed_width_file('input2.txt', 'output2.csv')
+
+    # Empty file test
+    parse_fixed_width_file('input3.txt', 'output3.csv')
+
+    # Single word test
+    parse_fixed_width_file('input4.txt', 'output4.csv')
+
+    # More than 98 characters test
+    parse_fixed_width_file('input5.txt', 'output5.csv')
+
+    # Commas test
+    parse_fixed_width_file('input6.txt', 'output6.csv')
\ No newline at end of file
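Since spec.json itself is not part of this diff, here is a small worked example of the slicing loop above, assuming hypothetical offsets of ["5", "3", "7"] (cast to ints by the map call):

# Hypothetical fixed-width record: a 5-char, a 3-char, and a 7-char column
line = "John 25 NY     "
offsets = [5, 3, 7]
current_pos = 0
row = []
for offset in offsets:
    # Take the next `offset` characters and strip the padding
    row.append(line[current_pos:current_pos + offset].strip())
    current_pos += offset
print(row)  # ['John', '25', 'NY']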
diff --git a/generate_large_file.py b/generate_large_file.py
new file mode 100644
index 00000000..753a0e54
--- /dev/null
+++ b/generate_large_file.py
@@ -0,0 +1,30 @@
+import random
+import string
+
+# Generate num_rows rows of random data
+def generate_random_data(num_rows):
+    data = []
+    for _ in range(num_rows):
+        first_name = ''.join(random.choices(string.ascii_uppercase, k=8))
+        last_name = ''.join(random.choices(string.ascii_uppercase, k=12))
+        address = ''.join(random.choices(string.ascii_uppercase + string.digits + ' ', k=20))
+        # DD-MM-YYYY; day and month are drawn independently, so some combinations
+        # (e.g. 31-02) are not valid calendar dates
+        dob = f'{random.randint(1, 31):02d}-{random.randint(1, 12):02d}-{random.randint(1900, 2000)}'
+        data.append([first_name, last_name, address, dob])
+    return data
+
+# Create and write data in chunks of num_rows_per_chunk rows;
+# the 30-million-row default yields roughly a 2 GB CSV file
+def generate_file(num_rows_per_chunk=500000, total_rows=30000000, columns=['first_name', 'last_name', 'address', 'date_of_birth']):
+    output_file = 'generated_large_data.csv'
+    with open(output_file, 'w') as f:
+        # Write the header row to the CSV file
+        f.write(','.join(columns) + '\n')
+
+        # Generate and write the data in chunks
+        for _ in range(total_rows // num_rows_per_chunk):
+            data_chunk = generate_random_data(num_rows_per_chunk)
+            # Write the chunk to the file
+            for row in data_chunk:
+                f.write(','.join(row) + '\n')
\ No newline at end of file
diff --git a/input1.txt b/input1.txt
new file mode 100644
index 00000000..68dd714d
--- /dev/null
+++ b/input1.txt
@@ -0,0 +1,2 @@
+John 25 M NY USA New York 1234 Developer 3000 Manager
+Mary 32 F LA USA Los Angeles 5000 Designer 4000 Leader
\ No newline at end of file
diff --git a/input2.txt b/input2.txt
new file mode 100644
index 00000000..5858fc27
--- /dev/null
+++ b/input2.txt
@@ -0,0 +1,2 @@
+William 250584390584390 ABCD NYNY UnitedStatesofAmerica New York City 12345678910 DeveloperJOBthing12 3000 Manajfkld^^yrewuihfdsklger
+Himanshu 32108349083290 WXYZ LALA USofAmerica Los Angeles County 5000 Designerbranded57#$"f 4000000000000000000000000 Leader
\ No newline at end of file
diff --git a/input3.txt b/input3.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/input4.txt b/input4.txt
new file mode 100644
index 00000000..caa1469d
--- /dev/null
+++ b/input4.txt
@@ -0,0 +1 @@
+SINGLEWORDTESTFILE
\ No newline at end of file
diff --git a/input5.txt b/input5.txt
new file mode 100644
index 00000000..a82fce38
--- /dev/null
+++ b/input5.txt
@@ -0,0 +1 @@
+jfdklsafjtre98t 74895 7r384r &*(%&v* (%)_%*#U49T04UFJILDJG *)&#() UREIWPTUFOPI9-EWU80 U80@#$%^&*&^%$#$%^&*( UISDUF JDISUFJEDITOUEWRGJORIUFJGTR8GUEW79GU89RFEIYGHJ8RIOG ITKPO9T7493847 39 EUW89TYWE79WE
\ No newline at end of file
diff --git a/input6.txt b/input6.txt
new file mode 100644
index 00000000..8eb5a047
--- /dev/null
+++ b/input6.txt
@@ -0,0 +1 @@
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
\ No newline at end of file
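Closing note: with the fixes above, a minimal end-to-end run of the anonymization pipeline looks like the sketch below; the small row counts are illustrative overrides of the ~2 GB defaults, not values taken from the code.

import generate_large_file
import demyst_anonymize

# Generate a small test file first (the defaults would produce ~30 million rows)
generate_large_file.generate_file(num_rows_per_chunk=1000, total_rows=5000)

# Then anonymize it; the result is written to anonymized_file.csv
demyst_anonymize.apply_to_data()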