diff --git a/data/cleaned_addresses.csv b/data/cleaned_addresses.csv
index b2850f5f30e7a0ba090e97b7fb4f6e910bdd0c2c..a19ed344679cbbd8129e98a21d23fbb74ca9a269 100644
--- a/data/cleaned_addresses.csv
+++ b/data/cleaned_addresses.csv
@@ -1,4 +1,4 @@
-address id,customer id,country,city,state,postal code,region
+address_id,customer_id,country,city,state,postal_code,region
 CG-12520_42420,CG-12520,United States,Henderson,Kentucky,42420,South
 DV-13045_90036,DV-13045,United States,Los Angeles,California,90036,West
 SO-20335_33311,SO-20335,United States,Fort Lauderdale,Florida,33311,South
diff --git a/data/cleaned_customers.csv b/data/cleaned_customers.csv
index 1c9e2ca30e51401898535f4de195c6c32c0dd049..2ee0722e52d230f1b2bc3e583be543938eeb6cee 100644
--- a/data/cleaned_customers.csv
+++ b/data/cleaned_customers.csv
@@ -1,4 +1,4 @@
-customer id,customer name,segment
+customer_id,customer_name,segment
 CG-12520,Claire Gute,Consumer
 DV-13045,Darrin Van Huff,Corporate
 SO-20335,Sean O'Donnell,Consumer
diff --git a/data/cleaned_departments.csv b/data/cleaned_departments.csv
index 10e7a882bb31e8bc15d3b9923d30e6e33bcc577f..11e9d63e0388a61207d63e8c47403ea61428ee8b 100644
--- a/data/cleaned_departments.csv
+++ b/data/cleaned_departments.csv
@@ -1,4 +1,4 @@
-department,city,storage capacity
+department,city,storage_capacity
 Furniture,New York City,3003
 Office Supplies,San Francisco,4043
 Technology,Philadelphia,2257
diff --git a/data/cleaned_order_details.csv b/data/cleaned_order_details.csv
index 6e16d92fbd731ffdffd6c82982a887bb859a2a00..f2a20c57efd049cd58155afac1caf57072dc68f2 100644
--- a/data/cleaned_order_details.csv
+++ b/data/cleaned_order_details.csv
@@ -1,4 +1,4 @@
-order id,product id,sales,quantity,profit
+order_id,product_id,sales,quantity,profit
 CA-2016-152156,FUR-BO-10001798,261.96,2,41.91
 CA-2016-152156,FUR-CH-10000454,731.94,3,219.58
 CA-2016-138688,OFF-LA-10000240,14.62,2,6.87
diff --git a/data/cleaned_orders.csv b/data/cleaned_orders.csv
index 6d46909f5ccb0838e66a3cf967dc087b3d49cae8..2ac260f69aad12b998a0b527a000fd56fcc4ae59 100644
--- a/data/cleaned_orders.csv
+++ b/data/cleaned_orders.csv
@@ -1,4 +1,4 @@
-order id,order date,ship date,customer id,address id
+order_id,order_date,ship_date,customer_id,address_id
 CA-2016-152156,2021-11-02,2021-11-05,CG-12520,CG-12520_42420
 CA-2016-138688,2021-06-06,2021-06-10,DV-13045,DV-13045_90036
 US-2015-108966,2020-10-04,2020-10-11,SO-20335,SO-20335_33311
diff --git a/data/cleaned_products.csv b/data/cleaned_products.csv
index 7348de4650e732b5dea90546969efc1fcf090943..b926c1542b72c859d4d40f6eed856067b610ebe7 100644
--- a/data/cleaned_products.csv
+++ b/data/cleaned_products.csv
@@ -1,4 +1,4 @@
-product id,department,sub-category,product name
+product_id,department,sub-category,product_name
 FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
 FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back"
 OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal
diff --git a/src/DataCleaner.py b/src/DataCleaner.py
index 9fef7a8ca1901233feff83646b85a6d838e9087b..17f9dd5a745c0b1a1705a912e2f5966ae45c4fc8 100644
--- a/src/DataCleaner.py
+++ b/src/DataCleaner.py
@@ -4,41 +4,38 @@
 files = {}
 dfs = {}
 fileNames = ["addresses", "customers", "departments", "order_details", "orders", "products"]
 
-
 def load_file(file_path):
     df = pd.read_csv(file_path)
     files[file_path] = df
     return df
 
-
 def sanitize_data(df):
-    # dropping duplicates
     df = df.drop_duplicates()
-    # drop missing values
     df = df.dropna()
-    # renaming columns to lowercase
-    df.columns = [col.lower() for col in df.columns]
+    # strip whitespace, lowercase, and replace spaces with underscores
+    df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
     return df
 
-
+# Load and clean every CSV
 for name in fileNames:
     try:
-        # importing files
         df = load_file("../data/" + name + ".csv")
-        dfs[name] = df
-        print(f"Loaded {name}")
-        # cleaning data
         dfs[name] = sanitize_data(df)
-        print(f"Sanitized {name}")
+        print(f"Loaded and sanitized {name}")
     except Exception as e:
         print(f"Failed to load {name} : {e}")
 
-# linking tables
+# Link the tables
 linked_dfs = []
-links = [['customers', 'orders', 'customer id'], ['orders', 'order_details', 'order id'],
-         ['order_details', 'products', 'product id'], ['departments', 'products', 'department'],
-         ['orders', 'addresses', 'address id'], ['orders', 'order_details', 'order id']]
+# Note: column names are now lowercase with underscores
+links = [
+    ['orders', 'order_details', 'order_id'],
+    ['products', 'order_details', 'product_id'],
+    ['departments', 'products', 'department'],
+    ['customers', 'orders', 'customer_id'],
+    ['addresses', 'orders', 'address_id']
+]
 
 def link_data(df1, df2, key):
     try:
@@ -48,13 +45,20 @@ def link_data(df1, df2, key):
         print(f"Failed to link DataFrames on {key}: {e}")
         return None
 
-
+# Run the joins
 for link in links:
-    linked_dfs.append(link_data(dfs[link[0]], dfs[link[1]], link[2]))
-    print(f"Linked {link[0]} and {link[1]} on {link[2]}")
-
-
-# saving cleaned and linked data
+    try:
+        df1 = dfs[link[0]]
+        df2 = dfs[link[1]]
+        key = link[2]
+        linked_df = link_data(df1, df2, key)
+        if linked_df is not None:
+            linked_dfs.append(linked_df)
+            print(f"Linked {link[0]} and {link[1]} on {key}")
+    except KeyError as e:
+        print(f"Missing key in dfs: {e}")
+
+# Helper to save a cleaned table
 def save_data(df, file_name):
     try:
         df.to_csv("../data/" + file_name, index=False)
@@ -62,6 +66,6 @@ def save_data(df, file_name):
     except Exception as e:
         print(f"Failed to save {file_name}: {e}")
 
-
+# Save the cleaned files
 for name, df in dfs.items():
     save_data(df, f"cleaned_{name}.csv")
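
The join loop above collects the merged tables in linked_dfs, but only the per-table dfs are written back to ../data/. If the joined views should also be persisted, a minimal sketch reusing save_data from DataCleaner.py could be appended after the save loop; the linked_*.csv file names are hypothetical, and it assumes every entry in links merged successfully so the two lists stay aligned:

# Sketch only: also persist the joined tables (the linked_*.csv names are illustrative).
# Assumes every link succeeded, so linked_dfs lines up one-to-one with links.
for (left, right, _), linked_df in zip(links, linked_dfs):
    save_data(linked_df, f"linked_{left}_{right}.csv")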