Sklearn PolynomialFeatures ~ Feature Engineering
An easy way to automate feature creation and improve our machine learning model's accuracy

Hi there, how are things going? I hope all is well. I came across an interesting topic: feature engineering using our favorite sklearn package.
Sklearn provides a powerful transform, PolynomialFeatures, that creates new features as polynomial combinations and interactions of the existing ones. The goal of such a transformation is to expand the set of input features in a way that gives the model more to learn from, and hence better accuracy.
If you can remember, back in the stone age of machine learning model building we used to manually create combinations of features, hoping each time for better model accuracy, which was a tedious and painful experience. This is where sklearn's PolynomialFeatures comes to the rescue.
Polynomial Features are created by raising existing features to an exponent.
Let’s understand this with the help of an example:
#simple example: demonstrate the types of features created
from numpy import asarray
from sklearn.preprocessing import PolynomialFeatures
# define the dataset
data = asarray([[2,3],[2,3],[2,3]])
print(data)
# perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=2)
data = trans.fit_transform(data)
print(data)
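#for each input row [2, 3], the degree-2 transform produces six columns:
#[1, 2, 3, 4, 6, 9] i.e. the bias term 1, the raw values 2 and 3,
#and the second-degree terms 2*2=4, 2*3=6 and 3*3=9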
#the “degree” argument controls the degree of the polynomial, and hence how many features are created; it defaults to 2.
#the “interaction_only” argument, when set to True, keeps only the raw values (degree 1) and the interactions (pairs of values multiplied with each other); it defaults to False.
#the “include_bias” argument defaults to True and adds a bias (constant 1) feature.
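To see how these arguments change the output, here is a minimal sketch on the same toy data: with interaction_only=True and include_bias=False, the squared terms and the bias column are dropped, leaving only the raw values and their product.
#illustrate the interaction_only and include_bias arguments on the same toy data
from numpy import asarray
from sklearn.preprocessing import PolynomialFeatures
data = asarray([[2, 3], [2, 3], [2, 3]])
#keep only raw values and pairwise interactions, and drop the bias column
trans = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
print(trans.fit_transform(data))
#each row becomes [2, 3, 6]: the raw values and their product, with no squares and no bias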
Now let's try this on a real dataset: the sonar binary classification dataset.
#Without Polynomial Features---------------------
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
dataset = read_csv("sonar.csv", header=None)
data = dataset.values
#separate into input and output columns
X, y = data[:, :-1], data[:, -1]
#ensure inputs are floats and output is an integer label
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
#define and configure the model
model = KNeighborsClassifier()
#evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
#report model performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Accuracy: 0.797 (0.073)
#With Polynomial Features-------------------------------
#summarize a polynomial features transform of the sonar dataset
from pandas import read_csv
from pandas import DataFrame
from sklearn.preprocessing import PolynomialFeatures
dataset = read_csv("sonar.csv", header=None)
#retrieve just the numeric input values
data = dataset.values[:, :-1]
#perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=3)
data = trans.fit_transform(data)
#convert the array back to a dataframe
dataset = DataFrame(data)
print("Shape of the dataset after Polynomial Features",(dataset.shape)
(208, 39711)
Clearly, we can observe that the transform expands the inputs from 61 features (the degree-1 case, including the bias term) to 39,711 features at degree 3.
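The feature count itself is just combinatorics: with n input features, a polynomial transform of degree d (including the bias term) produces "n + d choose d" output columns, so for the sonar data (n = 60, d = 3) that is C(63, 3) = 39,711, matching the shape printed above. A quick sketch to verify this:
#verify the expected number of polynomial features for the sonar inputs (Python 3.8+)
from math import comb
n_inputs, degree = 60, 3
print(comb(n_inputs + degree, degree))  #prints 39711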
#let's evaluate the model performance with the new features
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
dataset = read_csv("sonar.csv", header=None)
data = dataset.values
#separate into input and output columns
X, y = data[:, :-1], data[:, -1]
#ensure inputs are floats and output is an integer label
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
#define the pipeline
trans = PolynomialFeatures(degree=3)
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
#evaluate the pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
#report pipeline performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Accuracy: 0.800 (0.077)
The accuracy improved, if only slightly, from 0.797 to 0.800.
We can also examine the effect of the degree of the polynomial features on the number of features created.
#Effect of Polynomial Degree--------------------
#increasing the degree of the polynomial dramatically increases the number of input features
#compare the effect of the degree on the number of created features
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot
# get the dataset
def get_dataset():
    # load dataset
    url = "sonar.csv"
    dataset = read_csv(url, header=None)
    data = dataset.values
    # separate into input and output columns
    X, y = data[:, :-1], data[:, -1]
    # ensure inputs are floats and output is an integer label
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y
# define dataset
X, y = get_dataset()
# calculate change in number of features
num_features = list()
degrees = [i for i in range(1, 6)]
for d in degrees:
    # create transform
    trans = PolynomialFeatures(degree=d)
    # fit and transform
    data = trans.fit_transform(X)
    # record number of features
    num_features.append(data.shape[1])
    # summarize
    print('Degree: %d, Features: %d' % (d, data.shape[1]))
# plot degree vs number of features
pyplot.plot(degrees, num_features)
pyplot.show()
Degree: 1, Features: 61
Degree: 2, Features: 1891
Degree: 3, Features: 39711
Degree: 4, Features: 635376
Degree: 5, Features: 8259888

#Line Plot of the Degree vs. the Number of Input Features for the Polynomial Feature Transform
#More features may result in more overfitting, and in turn, worse results.
Now for some hyperparameter tuning: which degree of polynomial features gives the best accuracy?
#Hyperparameter Tuning--------------------
#explore the effect of degree on accuracy for the polynomial features transform
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
# get the dataset
def get_dataset():
    # load dataset
    url = "sonar.csv"
    dataset = read_csv(url, header=None)
    data = dataset.values
    # separate into input and output columns
    X, y = data[:, :-1], data[:, -1]
    # ensure inputs are floats and output is an integer label
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y
# get a list of models to evaluate
def get_models():
    models = dict()
    for d in range(1, 5):
        # define the pipeline
        trans = PolynomialFeatures(degree=d)
        model = KNeighborsClassifier()
        models[str(d)] = Pipeline(steps=[('t', trans), ('m', model)])
    return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
#Box Plots of Accuracy for Each Polynomial Degree
Degree 1 ~0.797 (0.073)
Degree 2 ~0.793 (0.085)
Degree 3 ~0.800 (0.077)
Degree 4 ~0.795 (0.079)

We can see that degree 3 performs best, giving the largest lift in model accuracy.
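As a side note, the same search over the degree can also be done with sklearn's GridSearchCV instead of the manual loop above; a minimal sketch, reusing the get_dataset() helper defined earlier:
#alternative: tune the polynomial degree with a grid search over the pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
X, y = get_dataset()
pipeline = Pipeline(steps=[('t', PolynomialFeatures()), ('m', KNeighborsClassifier())])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = GridSearchCV(pipeline, param_grid={'t__degree': [1, 2, 3, 4]}, scoring='accuracy', cv=cv, n_jobs=-1)
grid.fit(X, y)
#report the best degree found and its mean cross-validated accuracy
print(grid.best_params_, '%.3f' % grid.best_score_)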
That's it.
It's great to know all these advanced techniques. Thanks to the author Jason Brownlee for enlightening us about this hidden yet powerful feature engineering function.
Thanks again for your time. If you enjoyed this short article, there are tons of topics in advanced analytics, data science, and machine learning available in my Medium repo. https://medium.com/@bobrupakroy
Some of my alternative internet presences: Facebook, Instagram, Udemy, Blogger, Issuu, Slideshare, Scribd, and more.
Also available on Quora @ https://www.quora.com/profile/Rupak-Bob-Roy
Let me know if you need anything. Talk Soon.
