Professional Documents
Culture Documents
November 4, 2018
1
#### removing duplicate copies
# Sort by ProductId so that drop_duplicates keeps a deterministic "first" row.
# NOTE(review): lines were truncated in this extract at 'ki...' / keep='f...';
# reconstructed as kind='quicksort', keep='first' -- confirm against the
# original notebook.
sorted_data = df_sample.sort_values(
    'ProductId', axis=0, ascending=True, inplace=False, kind='quicksort',
)
# The same review text posted under several products shares
# UserId/ProfileName/Time/Text; keep only its first occurrence.
final = sorted_data.drop_duplicates(
    subset={"UserId", "ProfileName", "Time", "Text"}, keep='first',
)
# Sanity filter: helpful votes (numerator) can never exceed total votes
# (denominator); rows violating this are data errors.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
# Summarize the de-duplicated dataset: size, feature count, class balance.
DataPoints, Features = final.shape
labels = final["review"].unique()
# Unpacking assumes exactly two classes (positive / negative).
# NOTE(review): value_counts() orders by frequency while unique() orders by
# first appearance, so labels[0]/labels[1] may not align with x/y -- confirm.
x, y = final["review"].value_counts()
print("Final Data \n")
print("Total number of Datapoints -", DataPoints, '\n\n'
      'Total Number Features or Independent Variable -', Features - 1, '\n')
print('The Positive review has label \"{}\" and \"{}\" datapoint \n'.format(labels[0], x))
# Fixed 'Negatve' typo in the printed message.
print('The Negative review has class label \"{}\" and \"{}\" datapoints'.format(labels[1], y))
Final Data
The Negative review has class label "negative" and "86879" datapoints
2
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Encode the string class labels into integers (e.g. 'negative'/'positive' -> 0/1).
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
2 Grid Search
# https://www.kaggle.com/laowingkin/amazon-fine-food-review-sentiment-analysis
# Bag-of-Words featurization and train/test split.
vectorizer = CountVectorizer(stop_words='english')
BoW = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(BoW, Y, test_size=0.20)
X_train.shape

# BoW: 5-fold grid search over logistic-regression hyper-parameters
# (`tuned_parameters` is defined in an earlier cell not shown here).
# FIX(review): the original fit the search on the *test* split, leaking test
# data into model selection; search on the training split instead.
modelBoW = GridSearchCV(LogisticRegression(), tuned_parameters, scoring='f1', cv=5)
modelBoW.fit(X_train, y_train)

# TF-IDF: same grid search on the TF-IDF training split.
# NOTE(review): the original used undefined `Xtest`/`ytest`; the TF-IDF
# vectorization/split cell appears to be missing from this extract -- assumed
# variable names X_train_tf / y_train_tf. TODO confirm.
model_tf = GridSearchCV(LogisticRegression(), tuned_parameters, scoring='f1', cv=5)
model_tf.fit(X_train_tf, y_train_tf)
3
3 Random search
# https://chrisalbon.com/machine_learning/model_selection/hyperparameter_tuning_using_random_search/
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Create regularization penalty space to sample from.
penalty = ['l1', 'l2']

# BoW: randomized hyper-parameter search (`hyperparameters` dict is defined
# in a cell missing from this extract; the cv=/n_iter arguments were truncated
# at the page edge -- cv=5 assumed, matching the grid-search cells).
# FIX(review): the original fit the search on the *test* split (data leakage);
# search on the training split instead.
modelBoWr = RandomizedSearchCV(LogisticRegression(), hyperparameters, scoring='f1', cv=5)
modelBoWr.fit(X_train, y_train)

# TF-IDF: same randomized search on the TF-IDF training split.
# NOTE(review): the original used undefined `Xtest`/`ytest` -- assumed
# X_train_tf / y_train_tf from the missing TF-IDF split cell. TODO confirm.
model_tfr = RandomizedSearchCV(LogisticRegression(), hyperparameters, scoring='f1', cv=5)
model_tfr.fit(X_train_tf, y_train_tf)
# BoW - penalty l2: refit a plain logistic regression with the chosen l2
# penalty on the BoW training split and measure held-out accuracy.
# (Removed a stray page-number artifact that the PDF extraction had embedded
# in this cell.)
clfBoW2 = LogisticRegression(penalty='l2')
clfBoW2.fit(X_train, y_train)
predBoW2 = clfBoW2.predict(X_test)
Accuracy2 = round(accuracy_score(y_test, predBoW2, normalize=True), 4)
#Tf-IDF -penalty l1
#Tf-IDF -penalty l2
# Sweep the C grid and record error and sparsity for each model; 1/C is the
# regularization strength lambda, and with the l1 penalty the number of
# non-zero coefficients shrinks as lambda grows.
# FIX(review): the loop body had lost its indentation in this extract;
# reconstructed.
for i in C:
    # NOTE(review): penalty='l1' requires an l1-capable solver
    # (solver='liblinear' or 'saga') on modern scikit-learn -- confirm the
    # sklearn version this notebook targets.
    clf = LogisticRegression(C=i, penalty='l1')
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    accuracy = round(accuracy_score(y_test, pred, normalize=True), 4)
    S = 1 - round(accuracy, 5)           # misclassification error
    Score.append(S)
    Sp = np.count_nonzero(clf.coef_)     # number of surviving (non-zero) weights
    Sparse.append(Sp)
5
# Show the 20 most positive and 20 most negative words by learned coefficient
# (coeff_df is sorted by coefficient in an earlier cell).
top_positive = coeff_df.head(20)
top_negative = coeff_df.tail(20)
print(top_positive.to_string(index=False))
print('')
print('-Top 20 negative-')
print(top_negative.to_string(index=False))
-Top 20 positive-
Coefficient Word
11.250287 highly
10.779505 hooked
9.960425 great
9.807265 best
9.719121 delicious
9.653650 perfect
8.752537 skeptical
8.719017 excellent
8.168333 wonderful
7.925378 pleasantly
7.855584 loves
7.799740 amazing
7.245107 yum
7.203459 glad
6.977704 thank
6.942655 love
6.685501 awesome
6.635842 pleased
6.539283 worry
6.310895 yummy
-Top 20 negative-
Coefficient Word
-6.755963 hopes
-6.844374 hoping
-7.077717 stale
-7.079282 deceptive
-7.250155 okay
-7.597882 return
-7.748032 horrible
-7.773034 disgusting
-7.919038 bland
-8.029840 threw
-8.100359 ok
-8.333270 refund
-8.527532 disappointed
-8.796234 mediocre
-9.760861 disappointment
-10.095175 awful
-10.241993 terrible
6
-10.337588 unfortunately
-13.506665 disappointing
-14.761232 worst
3.4 Results
# Summary table: which selection method chose which penalty / optimal lambda
# for each model.
# NOTE(review): no add_row(...) calls survive in this extract, so this prints
# an empty table -- the row-population lines may have been lost in
# extraction. TODO confirm against the original notebook.
from prettytable import PrettyTable

x = PrettyTable()
x.field_names = ["Model", "Model Selection Method", "penalty", "Optimal lambda"]
print(x)
# Held-out accuracy comparison across feature sets (BoW / TF-IDF) and
# penalties (L1 / L2). Accuracy, Accuracy2, Accuracy3, Accuracy4 come from
# the refit cells above.
y = PrettyTable()
y.field_names = ["Model", "penalty", "Accuracy"]     # fixed 'Acuuracy' typo
y.add_row(["BoW", "L1", round(Accuracy * 100, 2)])
y.add_row(["BoW", "L2", round(Accuracy2 * 100, 2)])
# Made the model label consistent ('TfIDf' vs 'TfIDF' in the original).
y.add_row(["TfIDF", "L1", round(Accuracy3 * 100, 2)])
y.add_row(["TfIDF", "L2", round(Accuracy4 * 100, 2)])
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(y)
# https://stackoverflow.com/questions/22276066/how-to-plot-multiple-functions-on-the-same-figure
from sklearn import preprocessing  # NOTE(review): unused in this cell

# C grid used in the sweep, e.g. [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1].
# Min-max normalize both curves to [0, 1] so they can share one axis.
Sparse = np.array(Sparse)
Score = np.array(Score)
# FIX(review): the original called .reshape(1, -1) on both arrays and
# discarded the result (a no-op); removed.
Sparse_normalized = (Sparse - min(Sparse)) / (max(Sparse) - min(Sparse))
Score_normalized = (Score - min(Score)) / (max(Score) - min(Score))

C = np.array(C)
lambd = 1 / C   # lambda = 1/C is the regularization strength
# https://matplotlib.org/users/legend_guide.html
plt.loglog(lambd, Sparse_normalized, label='Sparse')  # sparsity vs lambda
plt.loglog(lambd, Score_normalized, label='Error')    # error vs lambda
plt.xlabel('Lambda')
plt.ylabel('Normalized log Scale')
plt.title('Relation between Lambda, Error and Sparsity')  # fixed 'Realtion' typo
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
8
9