-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data.py
More file actions
24 lines (18 loc) · 893 Bytes
/
clean_data.py
File metadata and controls
24 lines (18 loc) · 893 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
# Load the raw CSV; rows pandas cannot parse are skipped instead of raising.
df = pd.read_csv('data/Music_Test.csv', on_bad_lines='skip')

# Tidy the header: trim whitespace, drop the ';;' marker, then trim again.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(';;', '').str.strip()

# Apply the same ';;' removal to the Popularity values, working on their
# string form so the replacement is applied to every cell.
popularity_text = df['Popularity'].astype(str)
df['Popularity'] = popularity_text.str.replace(';;', '').str.strip()

# Discard rows that lack any of the essential descriptive fields.
required_cols = ['Artist', 'Title', 'Top Genre']
df = df.dropna(subset=required_cols)

# Columns that are stored as nullable integers in the cleaned file.
int_cols = [
    'Year',
    'Streams (Thousand)',
    'Energy',
    'Danceability',
    'Loudness (dB)',
    'Liveness',
    'Valence',
    'Length (Duration)',
    'Acousticness',
    'Speechiness',
    'Popularity',
]
for col in int_cols:
    # Values that cannot be parsed become missing (errors='coerce'); the
    # nullable 'Int64' dtype keeps those gaps alongside integer values.
    # NOTE(review): the cast assumes every parsed value is a whole number --
    # confirm against the data.
    parsed = pd.to_numeric(df[col], errors='coerce')
    df[col] = parsed.astype('Int64')

# Write the cleaned table without the index column, then report the result.
df.to_csv('data/Music_Test_clean.csv', index=False)
print(f'Linhas salvas: {len(df)}')
print(df.dtypes)