ValueError: Length mismatch: Expected axis has 15 elements, new values have 16 elements
I need help identifying the source of this error code. I initially wrote
some codes to design linear regression algorithms to determine the
probability of a customer purchasing a bike, After the initial
preprocessing and training/testing (Which worked), I am trying to
combine the codes to create a module for reusability. I encountered the
Mismatch error code. What do you think I'm not doing right and how do I
avoid the scenario.
import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
class CustomScaler(BaseEstimator,TransformerMixin):
def __init__(self,columns):
self.scaler = StandardScaler()
self.columns = columns
self.mean_ = None
self.var_ = None
def fit(self, X, y=None):
self.scaler.fit(X[self.columns], y)
self.mean_ = np.mean(X[self.columns])
self.var_ = np.var(X[self.columns])
return self
def transform(self, X, y=None, copy=None):
init_col_order = X.columns
X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
class Purchased_Bike_model():
def __init__(self, model_file, scaler_file):
with open('model','rb') as model_file, open('scaler', 'rb') as scaler_file:
self.reg = pickle.load(model_file)
self.scaler = pickle.load(scaler_file)
self.data = None
def load_and_clean_data(self, data_file):
# import the data
df = pd.read_csv(data_file,delimiter=',')
# store the data in a new variable for later use
self.df_with_predictions = df.copy()
# drop the 'ID' column
df = df.drop(['ID'], axis = 1)
# create a separate dataframe, containing dummy values for ALL occupations
Occupation_column = pd.get_dummies(df['Occupation'], drop_first = True)
# to avoid multicollinearity, drop the 'Occupation' column from df
df = df.drop(['Occupation'], axis = 1)
# concatenate df and the separate occupation dummies
df = pd.concat([df, Occupation_column], axis = 1)
# note: there is a more universal version of this code, however the following will best suit our current purposes
column_names = ['Marital Status', 'Gender', 'Income', 'Children', 'Education','Home Owner', 'Cars',
'Commute Distance', 'Region', 'Age', 'Purchased Bike', 'Clerical', 'Management',
'Manual', 'Professional', 'Skilled Manual']
df.columns = column_names
# re-order the columns in df
column_names_reordered = ['Marital Status', 'Gender', 'Income', 'Children', 'Education', 'Clerical',
'Management', 'Manual', 'Professional', 'Skilled Manual', 'Home Owner', 'Cars',
'Commute Distance', 'Region', 'Age', 'Purchased Bike']
df = df[column_names_reordered]
# encode all categorical variables
df['Marital Status'] = df['Marital Status'].map({'M':1, 'S':0})
df['Gender'] = df['Gender'].map({'M':1, 'F':0})
df['Education'] = df['Education'].map({'Partial High School':0,'High School':1,'Partial College':2,
'Bachelors':3, 'Graduate Degree':4})
df['Home Owner'] = df['Home Owner'].map({'Yes':1, 'No':0})
df['Commute Distance'] = df['Commute Distance'].map({'0-1 Miles':0, '1-2 Miles':1, '2-5 Miles':2,
'5-10 Miles':3, '10+ Miles':4})
df['Region'] = df['Region'].map({'Europe':0, 'Pacific':1, 'North America':2})
df['Purchased Bike'] = df['Purchased Bike'].map({'Yes':1, 'No':0})
df = df.drop(['Purchased Bike'],axis=1)
# we have included this line of code if you want to call the 'preprocessed data'
self.preprocessed_data = df.copy()
# we need this line so we can use it in the next functions
self.data = self.scaler.transform(df)
# a function which outputs the probability of a data point to be 1
def predicted_probability(self):
if (self.data is not None):
pred = self.reg.predict_proba(self.data)[:,1]
return pred
# a function which outputs 0 or 1 based on our model
def predicted_output_category(self):
if (self.data is not None):
pred_outputs = self.reg.predict(self.data)
return pred_outputs
# predict the outputs and the probabilities and
# add columns with these values at the end of the new data
def predicted_outputs(self):
if (self.data is not None):
self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:,1]
self.preprocessed_data ['Prediction'] = self.reg.predict(self.data)
return self.preprocessed_data
https://github.com/Kile-kun/Data_Analysis-Projects/blob/main/Bike%20Purchase%20Project/Bike%20Sales%20Raw.csv This is the link to the csv file
Hi Raji,
thanks for reaching out! If you're interested in finding out purchase probability or want to learn more about how to design a linear regression you're welcome to try our Machine Learning in Python course or our course on Customer Analytics in Python:
Machine Learning in Python Course | 365 Data Science
Customer Analytics in Python Course | 365 Data Science
I'm more than happy to offer any guidance related to the course material and practical case studies, though, please note that debugging unrelated student projects goes beyond the scope of what we offer.
Best,
365 Eli