Super learner
This user is a Super Learner. To become a Super Learner, you need to reach Level 8.
Posted on:

23 Apr 2024

0

PREPROCESSING EXERCISE: Not getting desired output.

My Code:

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

# Extracting the Data from the CSV file
data = np.loadtxt("Audiobooks_data.csv", delimiter=",")
print(f"{data.shape = }")


# Inputs and Targets Extraction
inputs_all = data[:, 1:-1]
targets_all = data[:, -1]


# Shuffling the Dataset
data_size_range = np.arange(inputs_all.shape[0])
np.random.shuffle(data_size_range)

inputs_shuffled_all = inputs_all[data_size_range]
targets_shuffled_all = targets_all[data_size_range]


# Balancing the Dataset
target_value = int(np.sum(targets_shuffled_all))
zeroes = 0
drop_values = []

for _ in range(len(data_size_range)):
    if targets_shuffled_all[_] == 0:
        zeroes += 1
        if zeroes > target_value:
            drop_values.append(_)
            
balanced_inputs = np.delete(inputs_shuffled_all, drop_values, axis=0)
balanced_targets = np.delete(targets_shuffled_all, drop_values, axis=0)

balanced_inputs_scaled = scale(balanced_inputs)


# Train-Validation-Test Split
data_size = balanced_inputs_scaled.shape[0]
train_size = int(.8 * data_size)
val_size = int(.1 * data_size)
test_size = data_size - train_size - val_size

X_train = balanced_inputs_scaled[:train_size]
X_val = balanced_inputs_scaled[train_size : train_size + val_size]
X_test = balanced_inputs_scaled[train_size + val_size:]

y_train = balanced_targets[:train_size]
y_val = balanced_targets[train_size : train_size + val_size]
y_test = balanced_targets[train_size + val_size:]


# Moment of Truth
print(np.sum(y_train), train_size, np.sum(y_train) / train_size)
print(np.sum(y_val), val_size, np.sum(y_val) / val_size)
print(np.sum(y_test), test_size, np.sum(y_test) / test_size)

Output:

data.shape = (14084, 12)
1342.0 3579 0.37496507404302876
447.0 447 1.0
448.0 448 1.0
0 answers ( 0 marked as helpful)

Submit an answer