Wine Quality Prediction
4 min read · Apr 9, 2021
Importing the required modules
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import pandas as pd
import numpy as np
Loading dataset
# Load the red-wine quality dataset into a DataFrame.
# (Fixed: the original used typographic quotes “…”, which are a SyntaxError.)
df = pd.read_csv("datasets/winequality-red.csv")
Understanding the dataset
# Quick sanity check of the data: first five rows, then the
# column dtypes / non-null counts.
df.head()
df.info()
Visualization
# Plot each physicochemical feature against wine quality.
# (Fixed: the first two calls used typographic quotes, a SyntaxError.)
# NOTE(review): no plt.figure()/plt.show() between calls, so in a plain
# script these all draw onto one axes — presumably this ran in a notebook
# where each cell renders separately; confirm before reusing as a script.
plt.bar('quality', 'fixed acidity', data=df)
plt.bar('quality', 'volatile acidity', data=df)
plt.bar('quality', 'citric acid', data=df)
plt.bar('quality', 'residual sugar', data=df)
plt.bar('quality', 'chlorides', data=df)
plt.bar('quality', 'free sulfur dioxide', data=df)
plt.bar('quality', 'total sulfur dioxide', data=df)
plt.scatter('quality', 'density', data=df)  # density shown as a scatter plot
plt.bar('quality', 'pH', data=df)
plt.bar('quality', 'sulphates', data=df)
plt.bar('quality', 'alcohol', data=df)
plt.plot(df["quality"])  # raw quality series over row index
Classifying the wine quality as good or bad
# bins sets the limits for the classification. pd.cut intervals are
# right-closed, so quality in (3, 6] -> 'bad' and (6, 8] -> 'good'.
# (Fixed: several lines used typographic quotes, which are a SyntaxError.)
bins = (3, 6, 8)
group_names = ['bad', 'good']
df["quality"] = pd.cut(df["quality"], bins=bins, labels=group_names)
df["quality"].head()
# One-hot encoding: with drop_first=True the 'bad' column is dropped,
# leaving a single indicator where 1 == 'good' and 0 == 'bad'.
df["quality"] = pd.get_dummies(df["quality"], drop_first=True)
df["quality"][:5]
df["quality"].value_counts()
Separating the dataset as target variable and feature variables
# Separate features from the binary target.
# (Fixed: typographic quotes around the column name, a SyntaxError.)
X = df.drop("quality", axis=1)  # all physicochemical feature columns
y = df['quality']               # 0/1 quality indicator
X.head()
y.head()
Splitting the dataset into train and test data
# Hold out 20% of the samples for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
Converting y_train to a 1-D array
# Flatten the target to a 1-D array (estimators warn on column vectors).
y_train = np.ravel(y_train)
# Echo the result for inspection.
y_train
Applying Standard scaling to get optimized result
# Standardize features: fit the scaler on the TRAINING data only, then
# apply that same transform to the test data.
# (Fixed: the original called fit_transform on X_test as well, re-fitting
# the scaler on test statistics — data leakage that also makes train and
# test features inconsistent with each other.)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Creating the models
# Model names, kept parallel to the accuracies appended for each below.
models = [
    'DecisionTreeClassifier',
    'Support Vector Machine',
    'GaussianNaiveBayes',
    'KNeighborsClassifier',
    'RandomForestClassifier',
]
# Filled with one accuracy per model, in the order above.
accuracy_score_list = []
DecisionTreeClassifier
# Decision tree baseline.
# (Fixed: predict and accuracy_score were pasted onto one line — a
# SyntaxError — now split into two statements.)
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
pred_dtc = dtc.predict(X_test)
acc = accuracy_score(y_test, pred_dtc)
accuracy_score_list.append(acc)
print(acc)
Support Vector Machine
# Support vector machine with default RBF kernel.
# (Fixed: predict and accuracy_score were pasted onto one line — a
# SyntaxError — now split into two statements.)
svm = SVC()
svm.fit(X_train, y_train)
pred_svm = svm.predict(X_test)
acc = accuracy_score(y_test, pred_svm)
accuracy_score_list.append(acc)
print(acc)
GaussianNaiveBayes
# Gaussian naive Bayes.
# (Fixed: predict and accuracy_score were pasted onto one line — a
# SyntaxError — now split into two statements.)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_gnb = gnb.predict(X_test)
acc = accuracy_score(y_test, pred_gnb)
accuracy_score_list.append(acc)
print(acc)
KNeighborsClassifier
# k-nearest neighbors; k=22 as chosen by the author (no tuning shown here).
# (Fixed: predict and accuracy_score were pasted onto one line — a
# SyntaxError — now split into two statements.)
knn = KNeighborsClassifier(n_neighbors=22)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)
acc = accuracy_score(y_test, pred_knn)
accuracy_score_list.append(acc)
print(acc)
RandomForestClassifier
# Random forest with default hyperparameters.
# (Fixed: predict and accuracy_score were pasted onto one line — a
# SyntaxError — now split into two statements.)
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
acc = accuracy_score(y_test, pred_rfc)
accuracy_score_list.append(acc)
print(acc)
# Tabulate each model's accuracy and compare them.
compare = pd.DataFrame({'Algorithms': models, 'accuracy_score': accuracy_score_list})
# (Fixed: sort_values returns a NEW frame; the original discarded the
# sorted result, so the plot below used the unsorted order.)
compare = compare.sort_values(by='accuracy_score', ascending=False)
plt.plot(compare['Algorithms'], compare['accuracy_score'], label="accuracy_score")
plt.legend()  # without this, the label= above is never shown