GitHub link: https://github.com/amirshnll/Internet-Advertisements
Email: [email protected]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# load the Internet Advertisements dataset
ads = pd.read_csv('adsdata.csv')
#ads.head(500)
# keep the label column "t" aside, then drop it and the non-numeric columns A, B, C, D
df_label = ads["t"].copy()
df = ads.drop("t", axis=1)
df_num = df.drop(["A", "B", "C", "D"], axis=1)
#df_num
######### train_test_split ############
from sklearn.model_selection import train_test_split
y = final.t  # "final" (df_num plus the label-encoded "t") is built a few cells below
y_train, y_test, df_num_train, df_num_test = train_test_split(y, df_num, test_size=0.2, random_state=30)
print("df_num_train set:", df_num_train.shape)
print("df_num_test set:", df_num_test.shape)
print("y_train set:", y_train.shape)
print("y_test set:", y_test.shape)
########
df_num_train set: (2623, 1554)
df_num_test set: (656, 1554)
y_train set: (2623,)
y_test set: (656,)
df_num
      Unnamed: 4  Unnamed: 5  Unnamed: 6  ...  Unnamed: 1555  Unnamed: 1556  Unnamed: 1557
0              0           0           0  ...              0              0              0
1              0           0           0  ...              0              0              0
2              0           0           0  ...              0              0              0
3              0           0           0  ...              0              0              0
4              0           0           0  ...              0              0              0
...          ...         ...         ...  ...            ...            ...            ...
3274           0           0           0  ...              0              0              0
3275           0           0           0  ...              0              0              0
3276           0           0           0  ...              0              0              0
3277           0           0           0  ...              0              0              0
3278           0           0           0  ...              0              0              0

[3279 rows × 1554 columns]
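# note: the feature matrix above is almost entirely zeros. If memory matters, a
# scipy CSR matrix stores only the non-zero entries (a minimal sketch, assuming
# scipy is installed; X_sparse is a new name, not used elsewhere in this notebook)
from scipy import sparse
X_sparse = sparse.csr_matrix(df_num.values)
print(X_sparse.shape, X_sparse.nnz)  # shape and count of non-zero entries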
# alternative path: one-hot encode the label instead of label-encoding it
#from sklearn.preprocessing import OneHotEncoder
#encoder_1hot = OneHotEncoder(sparse=False)
#data_cat_1hot_tmp = encoder_1hot.fit_transform(ads[["t"]])
#data_cat_1hot = pd.DataFrame(data_cat_1hot_tmp)
#data_cat_1hot.columns = encoder_1hot.get_feature_names(['test'])
#data_cat_1hot.head()
##############################final Data##########################
#final = pd.concat([df_num, data_cat_1hot], axis=1)
#final.head(10)
#final.describe()
#sns.countplot(x='test_ad.', data=final, palette='hls')
#plt.show()
        Unnamed: 4   Unnamed: 5   Unnamed: 6  ...  Unnamed: 1557     test_ad.  test_nonad.
count  3279.000000  3279.000000  3279.000000  ...    3279.000000  3279.000000  3279.000000
mean      0.004270     0.011589     0.004575  ...       0.001525     0.139982     0.860018
std       0.065212     0.107042     0.067491  ...       0.039026     0.347021     0.347021
min       0.000000     0.000000     0.000000  ...       0.000000     0.000000     0.000000
25%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
50%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
75%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
max       1.000000     1.000000     1.000000  ...       1.000000     1.000000     1.000000

[8 rows × 1556 columns]
from sklearn.preprocessing import LabelEncoder
# label-encode "t": LabelEncoder sorts classes alphabetically, so "ad." -> 0 and "nonad." -> 1
encoder = LabelEncoder()
data_adc = ads["t"]
data_adc_encode = encoder.fit_transform(data_adc)
data_adc_encode = pd.DataFrame(data_adc_encode, columns=["t"])
#data_adc_encode.head(500)
##############################final Data##########################
final = pd.concat([df_num, data_adc_encode], axis=1)
final.head(1000)
final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279 entries, 0 to 3278
Columns: 1555 entries, Unnamed: 4 to t
dtypes: int32(1), int64(1554)
memory usage: 38.9 MB
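# note: pd.concat aligns on the index, so a mismatched index would silently
# introduce NaNs. A quick sanity check (sketch; both frames here share the
# default RangeIndex, so this should pass):
assert final.isna().sum().sum() == 0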
import matplotlib.pyplot as plt
import numpy as np
#df_num.hist(bins=40, figsize=(40,40))
#plt.show()
import seaborn as sns
sns.countplot(x='t', data=final, palette='hls')
plt.title('class balance: 0 = ad., 1 = nonad.')
plt.show()
count_ad = len(final[final['t'] == 0])      # "ad." was encoded as 0
count_nonad = len(final[final['t'] == 1])   # "nonad." was encoded as 1
pct_of_ad = count_ad / (count_ad + count_nonad)
print("percentage of ad.:", pct_of_ad * 100)
pct_of_nonad = count_nonad / (count_ad + count_nonad)
print("percentage of nonad.:", pct_of_nonad * 100)
percentage of ad.: 13.998170173833484
percentage of nonad.: 86.00182982616651
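# given the ~14% / ~86% imbalance above, a stratified split keeps the same class
# ratio in train and test. A minimal sketch reusing y and df_num from above; the
# *_s names are new and do not overwrite the split used in the rest of the notebook:
from sklearn.model_selection import train_test_split
y_train_s, y_test_s, df_num_train_s, df_num_test_s = train_test_split(
    y, df_num, test_size=0.2, random_state=30, stratify=y)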
plt.figure(figsize=(10,8))
sns.heatmap(df_num.corr(), cmap='viridis')  # correlations over the numeric feature matrix
<AxesSubplot:>
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
dtc = DecisionTreeClassifier()
dtc = dtc.fit(df_num_train, y_train)
dt = dtc.predict(df_num_test)
print(classification_report(y_test, dt))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(dtc.score(df_num_test, y_test)))
from sklearn import tree
plt.figure(figsize=(20, 20))
# plot the tree already fitted on the training set (refitting on the full data
# here would change the model that was just evaluated)
temp = tree.plot_tree(dtc, fontsize=5)
plt.show()
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        79
           1       0.98      0.98      0.98       577

    accuracy                           0.96       656
   macro avg       0.91      0.90      0.91       656
weighted avg       0.96      0.96      0.96       656
Accuracy of Decision Tree classifier on test set: 0.96
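# an unpruned tree can memorize 1554 binary features; capping the depth is a
# common overfitting check (sketch; max_depth=10 is an arbitrary starting point,
# and dtc_pruned is a new name not used elsewhere)
dtc_pruned = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_pruned.fit(df_num_train, y_train)
print('pruned tree test accuracy: {:.2f}'.format(dtc_pruned.score(df_num_test, y_test)))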
#************* Naive Bayes classifier *******************
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(df_num_train, y_train).predict(df_num_test)
print(classification_report(y_test, y_pred))
print('Naive Bayes test accuracy: {:.2f}'.format(gnb.score(df_num_test, y_test)))
              precision    recall  f1-score   support

           0       0.32      0.86      0.47        79
           1       0.98      0.75      0.85       577

    accuracy                           0.76       656
   macro avg       0.65      0.81      0.66       656
weighted avg       0.90      0.76      0.80       656

Naive Bayes test accuracy: 0.76
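# the remaining features are all 0/1, so BernoulliNB may match the data better
# than GaussianNB (a minimal sketch; bnb is a new name)
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(df_num_train, y_train)
print('BernoulliNB test accuracy: {:.2f}'.format(bnb.score(df_num_test, y_test)))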
#****************** MLP Classifier ********************
from sklearn.neural_network import MLPClassifier
mpc = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000)
mpc.fit(df_num_train, y_train)
y_predm = mpc.predict(df_num_test)
print("MLP Classifier test accuracy: ", mpc.score(df_num_test, y_test))
print(classification_report(y_test, y_predm))
MLP Classifier test accuracy:  0.9634146341463414
              precision    recall  f1-score   support

           0       0.88      0.81      0.84        79
           1       0.97      0.98      0.98       577

    accuracy                           0.96       656
   macro avg       0.93      0.90      0.91       656
weighted avg       0.96      0.96      0.96       656
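# MLPs are sensitive to feature scale; with binary features this matters less,
# but wrapping a scaler into a pipeline is a safe default (a sketch, not the
# notebook's original method; mpc_scaled is a new name)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
mpc_scaled = make_pipeline(StandardScaler(),
                           MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000))
mpc_scaled.fit(df_num_train, y_train)
print('scaled MLP test accuracy: {:.2f}'.format(mpc_scaled.score(df_num_test, y_test)))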
#********************** knn ***************************
from sklearn.neighbors import KNeighborsClassifier
k = 5
adsk = KNeighborsClassifier(n_neighbors=k)
adsk.fit(df_num_train, y_train)
y_predk = adsk.predict(df_num_test)
print("when k={} neighbors, knn test accuracy:{}".format(k, adsk.score(df_num_test, y_test)))
print("when k={} neighbors, knn train accuracy:{}".format(k, adsk.score(df_num_train, y_train)))
print(classification_report(y_test, y_predk))
# sweep k from 1 to 29 and record train/test accuracy for each value
ran = np.arange(1, 30)
train_list = []
test_list = []
for each in ran:
    adsk = KNeighborsClassifier(n_neighbors=each)
    adsk.fit(df_num_train, y_train)
    test_list.append(adsk.score(df_num_test, y_test))
    train_list.append(adsk.score(df_num_train, y_train))
print("best knn test accuracy:{} at k={}".format(np.max(test_list), test_list.index(np.max(test_list)) + 1))
print("best knn train accuracy:{} at k={}".format(np.max(train_list), train_list.index(np.max(train_list)) + 1))
plt.figure(figsize=[15, 15])
plt.plot(ran, test_list, label='test score')
plt.plot(ran, train_list, label='train score')
plt.xlabel('number of neighbors')
plt.ylabel('accuracy')
plt.xticks(ran)
plt.legend()
plt.show()
when k=5 neighbors, knn test accuracy:0.9527439024390244
when k=5 neighbors, knn train accuracy:0.9672131147540983
              precision    recall  f1-score   support

           0       0.96      0.63      0.76        79
           1       0.95      1.00      0.97       577

    accuracy                           0.95       656
   macro avg       0.96      0.81      0.87       656
weighted avg       0.95      0.95      0.95       656
best knn test accuracy:0.9573170731707317 at k=4
best knn train accuracy:0.9939001143728555 at k=1
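# caveat: picking k by test-set accuracy leaks information into model selection.
# Cross-validation on the training set avoids that (a 5-fold sketch reusing ran,
# df_num_train and y_train from above; cv_scores and best_k are new names)
from sklearn.model_selection import cross_val_score
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             df_num_train, y_train, cv=5).mean() for k in ran]
best_k = ran[int(np.argmax(cv_scores))]
print("best k by 5-fold CV:", best_k)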
#**************** Logistic Regression Classifier *****************
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
logreg = LogisticRegression()
logreg.fit(df_num_train, y_train)
y_pred = logreg.predict(df_num_test)
print(classification_report(y_test, y_pred))
print('Accuracy of logistic regression classifier on test set:{:.2f}'.format(logreg.score(df_num_test, y_test)))
              precision    recall  f1-score   support

           0       0.98      0.80      0.88        79
           1       0.97      1.00      0.99       577

    accuracy                           0.97       656
   macro avg       0.98      0.90      0.93       656
weighted avg       0.97      0.97      0.97       656
Accuracy of logistic regression classifier on test set:0.97
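# side-by-side test accuracies of the models fitted above (a sketch; assumes
# dtc, gnb, mpc, adsk and logreg are still in scope from the earlier cells,
# with adsk holding the last k from the sweep)
for name, model in [('decision tree', dtc), ('naive bayes', gnb), ('mlp', mpc),
                    ('knn', adsk), ('logistic regression', logreg)]:
    print('{}: {:.2f}'.format(name, model.score(df_num_test, y_test)))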