ValueError: Length mismatch: Expected axis has 15 elements, new values have 16 elements

Question

I need help identifying the source of this error code. I initially wrote
some codes to design linear regression algorithms to determine the
probability of a customer purchasing a bike, After the initial
preprocessing and training/testing (Which worked), I am trying to
combine the codes to create a module for reusability. I encountered the
Mismatch error code. What do you think I'm not doing right and how do I
avoid the scenario.

import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin):

def __init__(self,columns):

self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

def transform(self, X, y=None, copy=None):

init_col_order = X.columns

X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)

X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

class Purchased_Bike_model():

def __init__(self, model_file, scaler_file):
        with open('model','rb') as model_file, open('scaler', 'rb') as scaler_file:
                self.reg = pickle.load(model_file)
                self.scaler = pickle.load(scaler_file)
                self.data = None

def load_and_clean_data(self, data_file):

# import the data
            df = pd.read_csv(data_file,delimiter=',')
            # store the data in a new variable for later use
            self.df_with_predictions = df.copy()
            # drop the 'ID' column
            df = df.drop(['ID'], axis = 1)

# create a separate dataframe, containing dummy values for ALL occupations
Occupation_column = pd.get_dummies(df['Occupation'], drop_first = True)

# to avoid multicollinearity, drop the 'Occupation' column from df
df = df.drop(['Occupation'], axis = 1)

# concatenate df and the separate occupation dummies
df = pd.concat([df, Occupation_column], axis = 1)

# note: there is a more universal version of this code, however the following will best suit our current purposes
            column_names = ['Marital Status', 'Gender', 'Income', 'Children', 'Education','Home Owner', 'Cars',
                            'Commute Distance', 'Region', 'Age', 'Purchased Bike', 'Clerical', 'Management',
                            'Manual', 'Professional', 'Skilled Manual']
            df.columns = column_names

# re-order the columns in df
            column_names_reordered = ['Marital Status', 'Gender', 'Income', 'Children', 'Education', 'Clerical',
                                      'Management', 'Manual', 'Professional', 'Skilled Manual', 'Home Owner', 'Cars',
                                      'Commute Distance', 'Region', 'Age', 'Purchased Bike']
            df = df[column_names_reordered]

# encode all categorical variables
            df['Marital Status'] = df['Marital Status'].map({'M':1, 'S':0})
            df['Gender'] = df['Gender'].map({'M':1, 'F':0})
            df['Education'] = df['Education'].map({'Partial High School':0,'High School':1,'Partial College':2,
                                                    'Bachelors':3, 'Graduate Degree':4})
            df['Home Owner'] = df['Home Owner'].map({'Yes':1, 'No':0})
            df['Commute Distance'] = df['Commute Distance'].map({'0-1 Miles':0, '1-2 Miles':1, '2-5 Miles':2,
                                                                 '5-10 Miles':3, '10+ Miles':4})
            df['Region'] = df['Region'].map({'Europe':0, 'Pacific':1, 'North America':2})
            df['Purchased Bike'] = df['Purchased Bike'].map({'Yes':1, 'No':0})

df = df.drop(['Purchased Bike'],axis=1)

# we have included this line of code if you want to call the 'preprocessed data'
self.preprocessed_data = df.copy()

# we need this line so we can use it in the next functions
self.data = self.scaler.transform(df)

# a function which outputs the probability of a data point to be 1
    def predicted_probability(self):
            if (self.data is not None):
                pred = self.reg.predict_proba(self.data)[:,1]
                return pred

# a function which outputs 0 or 1 based on our model
    def predicted_output_category(self):
            if (self.data is not None):
                pred_outputs = self.reg.predict(self.data)
                return pred_outputs

# predict the outputs and the probabilities and
        # add columns with these values at the end of the new data
    def predicted_outputs(self):
            if (self.data is not None):
                self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:,1]
                self.preprocessed_data ['Prediction'] = self.reg.predict(self.data)
                return self.preprocessed_data

Answer 1

https://github.com/Kile-kun/Data_Analysis-Projects/blob/main/Bike%20Purchase%20Project/Bike%20Sales%20Raw.csv This is the link to the csv file

Answer 2

Hi Raji,
thanks for reaching out! If you're interested in finding out purchase probability or want to learn more about how to design a linear regression you're welcome to try our Machine Learning in Python course or our course on Customer Analytics in Python:
Machine Learning in Python Course | 365 Data Science
Customer Analytics in Python Course | 365 Data Science
I'm more than happy to offer any guidance related to the course material and practical case studies, though, please note that debugging unrelated student projects goes beyond the scope of what we offer.

Best,
365 Eli

Answer 3

RAJI BABATUNDE

Posted on:

29 Mar 2022

0

ValueError: Length mismatch: Expected axis has 15 elements, new values have 16 elements

Submit an answer

related questions