From 1d90c6f34827925c1dd7bd4355cbaf631417372b Mon Sep 17 00:00:00 2001
From: Nasmat Arouna <arounanasmat@gmail.com>
Date: Tue, 27 May 2025 16:56:56 +0200
Subject: [PATCH] =?UTF-8?q?R=C3=A9organisation=20des=20liens=20de=20la=20t?=
=?UTF-8?q?able=20de=20faits?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
data/cleaned_addresses.csv | 2 +-
data/cleaned_customers.csv | 2 +-
data/cleaned_departments.csv | 2 +-
data/cleaned_order_details.csv | 2 +-
data/cleaned_orders.csv | 2 +-
data/cleaned_products.csv | 2 +-
src/DataCleaner.py | 50 ++++++++++++++++++----------------
7 files changed, 33 insertions(+), 29 deletions(-)
diff --git a/data/cleaned_addresses.csv b/data/cleaned_addresses.csv
index b2850f5..a19ed34 100644
--- a/data/cleaned_addresses.csv
+++ b/data/cleaned_addresses.csv
@@ -1,4 +1,4 @@
-address id,customer id,country,city,state,postal code,region
+address_id,customer_id,country,city,state,postal_code,region
CG-12520_42420,CG-12520,United States,Henderson,Kentucky,42420,South
DV-13045_90036,DV-13045,United States,Los Angeles,California,90036,West
SO-20335_33311,SO-20335,United States,Fort Lauderdale,Florida,33311,South
diff --git a/data/cleaned_customers.csv b/data/cleaned_customers.csv
index 1c9e2ca..2ee0722 100644
--- a/data/cleaned_customers.csv
+++ b/data/cleaned_customers.csv
@@ -1,4 +1,4 @@
-customer id,customer name,segment
+customer_id,customer_name,segment
CG-12520,Claire Gute,Consumer
DV-13045,Darrin Van Huff,Corporate
SO-20335,Sean O'Donnell,Consumer
diff --git a/data/cleaned_departments.csv b/data/cleaned_departments.csv
index 10e7a88..11e9d63 100644
--- a/data/cleaned_departments.csv
+++ b/data/cleaned_departments.csv
@@ -1,4 +1,4 @@
-department,city,storage capacity
+department,city,storage_capacity
Furniture,New York City,3003
Office Supplies,San Francisco,4043
Technology,Philadelphia,2257
diff --git a/data/cleaned_order_details.csv b/data/cleaned_order_details.csv
index 6e16d92..f2a20c5 100644
--- a/data/cleaned_order_details.csv
+++ b/data/cleaned_order_details.csv
@@ -1,4 +1,4 @@
-order id,product id,sales,quantity,profit
+order_id,product_id,sales,quantity,profit
CA-2016-152156,FUR-BO-10001798,261.96,2,41.91
CA-2016-152156,FUR-CH-10000454,731.94,3,219.58
CA-2016-138688,OFF-LA-10000240,14.62,2,6.87
diff --git a/data/cleaned_orders.csv b/data/cleaned_orders.csv
index 6d46909..2ac260f 100644
--- a/data/cleaned_orders.csv
+++ b/data/cleaned_orders.csv
@@ -1,4 +1,4 @@
-order id,order date,ship date,customer id,address id
+order_id,order_date,ship_date,customer_id,address_id
CA-2016-152156,2021-11-02,2021-11-05,CG-12520,CG-12520_42420
CA-2016-138688,2021-06-06,2021-06-10,DV-13045,DV-13045_90036
US-2015-108966,2020-10-04,2020-10-11,SO-20335,SO-20335_33311
diff --git a/data/cleaned_products.csv b/data/cleaned_products.csv
index 7348de4..b926c15 100644
--- a/data/cleaned_products.csv
+++ b/data/cleaned_products.csv
@@ -1,4 +1,4 @@
-product id,department,sub-category,product name
+product_id,department,sub-category,product_name
FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back"
OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal
diff --git a/src/DataCleaner.py b/src/DataCleaner.py
index 9fef7a8..17f9dd5 100644
--- a/src/DataCleaner.py
+++ b/src/DataCleaner.py
@@ -4,41 +4,38 @@ files = {}
dfs = {}
fileNames = ["addresses", "customers", "departments", "order_details", "orders", "products"]
-
def load_file(file_path):
df = pd.read_csv(file_path)
files[file_path] = df
return df
-
def sanitize_data(df):
- # dropping duplicates
df = df.drop_duplicates()
- # drop missing values
df = df.dropna()
- # renaming columns to lowercase
- df.columns = [col.lower() for col in df.columns]
+ # on retire les espaces et on met tout en minuscule
+ df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
return df
-
+# Charger et nettoyer
for name in fileNames:
try:
- # importing files
df = load_file("../data/" + name + ".csv")
- dfs[name] = df
- print(f"Loaded {name}")
- # cleaning data
dfs[name] = sanitize_data(df)
- print(f"Sanitized {name}")
+ print(f"Loaded and sanitized {name}")
except Exception as e:
print(f"Failed to load {name} : {e}")
-# linking tables
+# Fusionner les tables
linked_dfs = []
-links = [['customers', 'orders', 'customer id'], ['orders', 'order_details', 'order id'],
- ['order_details', 'products', 'product id'], ['departments', 'products', 'department'],
- ['orders', 'addresses', 'address id'], ['orders', 'order_details', 'order id']]
+# Remarque : colonnes en minuscules et underscore
+links = [
+ ['orders', 'order_details', 'order_id'],
+ ['products', 'order_details', 'product_id'],
+    ['departments', 'products', 'department'],
+ ['customers', 'orders', 'customer_id'],
+ ['addresses', 'orders', 'address_id']
+]
def link_data(df1, df2, key):
try:
@@ -48,13 +45,20 @@ def link_data(df1, df2, key):
print(f"Failed to link DataFrames on {key}: {e}")
return None
-
+# Lancer les jointures
for link in links:
- linked_dfs.append(link_data(dfs[link[0]], dfs[link[1]], link[2]))
- print(f"Linked {link[0]} and {link[1]} on {link[2]}")
-
-
-# saving cleaned and linked data
+ try:
+ df1 = dfs[link[0]]
+ df2 = dfs[link[1]]
+ key = link[2]
+ linked_df = link_data(df1, df2, key)
+ if linked_df is not None:
+ linked_dfs.append(linked_df)
+ print(f"Linked {link[0]} and {link[1]} on {key}")
+ except KeyError as e:
+ print(f"Missing key in dfs: {e}")
+
+# Sauvegarde des fichiers nettoyés
def save_data(df, file_name):
try:
df.to_csv("../data/" + file_name, index=False)
@@ -62,6 +66,6 @@ def save_data(df, file_name):
except Exception as e:
print(f"Failed to save {file_name}: {e}")
-
+# Boucle de sauvegarde
for name, df in dfs.items():
save_data(df, f"cleaned_{name}.csv")
--
GitLab