From 1d90c6f34827925c1dd7bd4355cbaf631417372b Mon Sep 17 00:00:00 2001
From: Nasmat Arouna <arounanasmat@gmail.com>
Date: Tue, 27 May 2025 16:56:56 +0200
Subject: [PATCH] =?UTF-8?q?R=C3=A9organisation=20des=20liens=20de=20la=20t?=
=?UTF-8?q?able=20de=20faits?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
data/cleaned_addresses.csv | 2 +-
data/cleaned_customers.csv | 2 +-
data/cleaned_departments.csv | 2 +-
data/cleaned_order_details.csv | 2 +-
data/cleaned_orders.csv | 2 +-
data/cleaned_products.csv | 2 +-
src/DataCleaner.py | 50 ++++++++++++++++++----------------
7 files changed, 33 insertions(+), 29 deletions(-)
diff --git a/data/cleaned_addresses.csv b/data/cleaned_addresses.csv
index b2850f5..a19ed34 100644
--- a/data/cleaned_addresses.csv
+++ b/data/cleaned_addresses.csv
@@ -1,4 +1,4 @@
-address id,customer id,country,city,state,postal code,region
+address_id,customer_id,country,city,state,postal_code,region
CG-12520_42420,CG-12520,United States,Henderson,Kentucky,42420,South
DV-13045_90036,DV-13045,United States,Los Angeles,California,90036,West
SO-20335_33311,SO-20335,United States,Fort Lauderdale,Florida,33311,South
diff --git a/data/cleaned_customers.csv b/data/cleaned_customers.csv
index 1c9e2ca..2ee0722 100644
--- a/data/cleaned_customers.csv
+++ b/data/cleaned_customers.csv
@@ -1,4 +1,4 @@
-customer id,customer name,segment
+customer_id,customer_name,segment
CG-12520,Claire Gute,Consumer
DV-13045,Darrin Van Huff,Corporate
SO-20335,Sean O'Donnell,Consumer
diff --git a/data/cleaned_departments.csv b/data/cleaned_departments.csv
index 10e7a88..11e9d63 100644
--- a/data/cleaned_departments.csv
+++ b/data/cleaned_departments.csv
@@ -1,4 +1,4 @@
-department,city,storage capacity
+department,city,storage_capacity
Furniture,New York City,3003
Office Supplies,San Francisco,4043
Technology,Philadelphia,2257
diff --git a/data/cleaned_order_details.csv b/data/cleaned_order_details.csv
index 6e16d92..f2a20c5 100644
--- a/data/cleaned_order_details.csv
+++ b/data/cleaned_order_details.csv
@@ -1,4 +1,4 @@
-order id,product id,sales,quantity,profit
+order_id,product_id,sales,quantity,profit
CA-2016-152156,FUR-BO-10001798,261.96,2,41.91
CA-2016-152156,FUR-CH-10000454,731.94,3,219.58
CA-2016-138688,OFF-LA-10000240,14.62,2,6.87
diff --git a/data/cleaned_orders.csv b/data/cleaned_orders.csv
index 6d46909..2ac260f 100644
--- a/data/cleaned_orders.csv
+++ b/data/cleaned_orders.csv
@@ -1,4 +1,4 @@
-order id,order date,ship date,customer id,address id
+order_id,order_date,ship_date,customer_id,address_id
CA-2016-152156,2021-11-02,2021-11-05,CG-12520,CG-12520_42420
CA-2016-138688,2021-06-06,2021-06-10,DV-13045,DV-13045_90036
US-2015-108966,2020-10-04,2020-10-11,SO-20335,SO-20335_33311
diff --git a/data/cleaned_products.csv b/data/cleaned_products.csv
index 7348de4..b926c15 100644
--- a/data/cleaned_products.csv
+++ b/data/cleaned_products.csv
@@ -1,4 +1,4 @@
-product id,department,sub-category,product name
+product_id,department,sub-category,product_name
FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase
FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back"
OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters by Universal
diff --git a/src/DataCleaner.py b/src/DataCleaner.py
index 9fef7a8..17f9dd5 100644
--- a/src/DataCleaner.py
+++ b/src/DataCleaner.py
@@ -4,41 +4,38 @@ files = {}
dfs = {}
fileNames = ["addresses", "customers", "departments", "order_details", "orders", "products"]
-
def load_file(file_path):
df = pd.read_csv(file_path)
files[file_path] = df
return df
-
def sanitize_data(df):
- # dropping duplicates
df = df.drop_duplicates()
- # drop missing values
df = df.dropna()
- # renaming columns to lowercase
- df.columns = [col.lower() for col in df.columns]
+ # on retire les espaces et on met tout en minuscule
+ df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
return df
-
+# Charger et nettoyer
for name in fileNames:
try:
- # importing files
df = load_file("../data/" + name + ".csv")
- dfs[name] = df
- print(f"Loaded {name}")
- # cleaning data
dfs[name] = sanitize_data(df)
- print(f"Sanitized {name}")
+ print(f"Loaded and sanitized {name}")
except Exception as e:
print(f"Failed to load {name} : {e}")
-# linking tables
+# Fusionner les tables
linked_dfs = []
-links = [['customers', 'orders', 'customer id'], ['orders', 'order_details', 'order id'],
- ['order_details', 'products', 'product id'], ['departments', 'products', 'department'],
- ['orders', 'addresses', 'address id'], ['orders', 'order_details', 'order id']]
+# Remarque : colonnes en minuscules et underscore
+links = [
+ ['orders', 'order_details', 'order_id'],
+ ['products', 'order_details', 'product_id'],
+    ['departments', 'products', 'department'],
+ ['customers', 'orders', 'customer_id'],
+ ['addresses', 'orders', 'address_id']
+]
def link_data(df1, df2, key):
try:
@@ -48,13 +45,20 @@ def link_data(df1, df2, key):
print(f"Failed to link DataFrames on {key}: {e}")
return None
-
+# Lancer les jointures
for link in links:
- linked_dfs.append(link_data(dfs[link[0]], dfs[link[1]], link[2]))
- print(f"Linked {link[0]} and {link[1]} on {link[2]}")
-
-
-# saving cleaned and linked data
+ try:
+ df1 = dfs[link[0]]
+ df2 = dfs[link[1]]
+ key = link[2]
+ linked_df = link_data(df1, df2, key)
+ if linked_df is not None:
+ linked_dfs.append(linked_df)
+ print(f"Linked {link[0]} and {link[1]} on {key}")
+ except KeyError as e:
+ print(f"Missing key in dfs: {e}")
+
+# Sauvegarde des fichiers nettoyés
def save_data(df, file_name):
try:
df.to_csv("../data/" + file_name, index=False)
@@ -62,6 +66,6 @@ def save_data(df, file_name):
except Exception as e:
print(f"Failed to save {file_name}: {e}")
-
+# Boucle de sauvegarde
for name, df in dfs.items():
save_data(df, f"cleaned_{name}.csv")
--
GitLab