GitHub link: https://github.com/amirshnll/Internet-Advertisements
Email: [email protected]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# load the Internet Advertisements dataset
ads = pd.read_csv('adsdata.csv')
#ads.head(500)
# keep the label column "t" aside, then drop it and the non-numeric columns A, B, C, D
df_label = ads["t"].copy()
df = ads.drop("t", axis=1)
df_num = df.drop(["A", "B", "C", "D"], axis=1)
#df_num
######### train_test_split ############
from sklearn.model_selection import train_test_split
y = final.t  # "final" (df_num plus the label-encoded "t") is built a few cells below
y_train, y_test, df_num_train, df_num_test = train_test_split(y, df_num, test_size=0.2, random_state=30)
print("df_num_train set:", df_num_train.shape)
print("df_num_test set:", df_num_test.shape)
print("y_train set:", y_train.shape)
print("y_test set:", y_test.shape)
########
df_num_train set: (2623, 1554)
df_num_test set: (656, 1554)
y_train set: (2623,)
y_test set: (656,)
df_num
      Unnamed: 4  Unnamed: 5  Unnamed: 6  ...  Unnamed: 1555  Unnamed: 1556  Unnamed: 1557
0              0           0           0  ...              0              0              0
1              0           0           0  ...              0              0              0
2              0           0           0  ...              0              0              0
3              0           0           0  ...              0              0              0
4              0           0           0  ...              0              0              0
...          ...         ...         ...  ...            ...            ...            ...
3274           0           0           0  ...              0              0              0
3275           0           0           0  ...              0              0              0
3276           0           0           0  ...              0              0              0
3277           0           0           0  ...              0              0              0
3278           0           0           0  ...              0              0              0

[3279 rows × 1554 columns]
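# note: the feature matrix above is almost entirely zeros. If memory matters, a
# scipy CSR matrix stores only the non-zero entries (a minimal sketch, assuming
# scipy is installed; X_sparse is a new name, not used elsewhere in this notebook)
from scipy import sparse
X_sparse = sparse.csr_matrix(df_num.values)
print(X_sparse.shape, X_sparse.nnz)  # shape and count of non-zero entries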
# alternative path: one-hot encode the label instead of label-encoding it
#from sklearn.preprocessing import OneHotEncoder
#encoder_1hot = OneHotEncoder(sparse=False)
#data_cat_1hot_tmp = encoder_1hot.fit_transform(ads[["t"]])
#data_cat_1hot = pd.DataFrame(data_cat_1hot_tmp)
#data_cat_1hot.columns = encoder_1hot.get_feature_names(['test'])
#data_cat_1hot.head()
##############################final Data##########################
#final = pd.concat([df_num, data_cat_1hot], axis=1)
#final.head(10)
#final.describe()
#sns.countplot(x='test_ad.', data=final, palette='hls')
#plt.show()
        Unnamed: 4   Unnamed: 5   Unnamed: 6  ...  Unnamed: 1557     test_ad.  test_nonad.
count  3279.000000  3279.000000  3279.000000  ...    3279.000000  3279.000000  3279.000000
mean      0.004270     0.011589     0.004575  ...       0.001525     0.139982     0.860018
std       0.065212     0.107042     0.067491  ...       0.039026     0.347021     0.347021
min       0.000000     0.000000     0.000000  ...       0.000000     0.000000     0.000000
25%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
50%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
75%       0.000000     0.000000     0.000000  ...       0.000000     0.000000     1.000000
max       1.000000     1.000000     1.000000  ...       1.000000     1.000000     1.000000

[8 rows × 1556 columns]
from sklearn.preprocessing import LabelEncoder
# label-encode "t": LabelEncoder sorts classes alphabetically, so "ad." -> 0 and "nonad." -> 1
encoder = LabelEncoder()
data_adc = ads["t"]
data_adc_encode = encoder.fit_transform(data_adc)
data_adc_encode = pd.DataFrame(data_adc_encode, columns=["t"])
#data_adc_encode.head(500)
##############################final Data##########################
final = pd.concat([df_num, data_adc_encode], axis=1)
final.head(1000)
final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279 entries, 0 to 3278
Columns: 1555 entries, Unnamed: 4 to t
dtypes: int32(1), int64(1554)
memory usage: 38.9 MB
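# note: pd.concat aligns on the index, so a mismatched index would silently
# introduce NaNs. A quick sanity check (sketch; both frames here share the
# default RangeIndex, so this should pass):
assert final.isna().sum().sum() == 0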
import matplotlib.pyplot as plt
import numpy as np
#df_num.hist(bins=40, figsize=(40,40))
#plt.show()
import seaborn as sns
sns.countplot(x='t', data=final, palette='hls')
plt.title('class balance: 0 = ad., 1 = nonad.')
plt.show()
count_ad = len(final[final['t'] == 0])      # "ad." was encoded as 0
count_nonad = len(final[final['t'] == 1])   # "nonad." was encoded as 1
pct_of_ad = count_ad / (count_ad + count_nonad)
print("percentage of ad.:", pct_of_ad * 100)
pct_of_nonad = count_nonad / (count_ad + count_nonad)
print("percentage of nonad.:", pct_of_nonad * 100)
percentage of ad.: 13.998170173833484
percentage of nonad.: 86.00182982616651
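# given the ~14% / ~86% imbalance above, a stratified split keeps the same class
# ratio in train and test. A minimal sketch reusing y and df_num from above; the
# *_s names are new and do not overwrite the split used in the rest of the notebook:
from sklearn.model_selection import train_test_split
y_train_s, y_test_s, df_num_train_s, df_num_test_s = train_test_split(
    y, df_num, test_size=0.2, random_state=30, stratify=y)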
plt.figure(figsize=(10,8))
sns.heatmap(df_num.corr(), cmap='viridis')  # correlations over the numeric feature matrix
<AxesSubplot:>
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
dtc = DecisionTreeClassifier()
dtc = dtc.fit(df_num_train, y_train)
dt = dtc.predict(df_num_test)
print(classification_report(y_test, dt))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(dtc.score(df_num_test, y_test)))
from sklearn import tree
plt.figure(figsize=(20, 20))
# plot the tree already fitted on the training set (refitting on the full data
# here would change the model that was just evaluated)
temp = tree.plot_tree(dtc, fontsize=5)
plt.show()
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        79
           1       0.98      0.98      0.98       577

    accuracy                           0.96       656
   macro avg       0.91      0.90      0.91       656
weighted avg       0.96      0.96      0.96       656
Accuracy of Decision Tree classifier on test set: 0.96
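# an unpruned tree can memorize 1554 binary features; capping the depth is a
# common overfitting check (sketch; max_depth=10 is an arbitrary starting point,
# and dtc_pruned is a new name not used elsewhere)
dtc_pruned = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_pruned.fit(df_num_train, y_train)
print('pruned tree test accuracy: {:.2f}'.format(dtc_pruned.score(df_num_test, y_test)))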
#************* Naive Bayes classifier *******************
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(df_num_train, y_train).predict(df_num_test)
print(classification_report(y_test, y_pred))
print('Naive Bayes test accuracy: {:.2f}'.format(gnb.score(df_num_test, y_test)))
              precision    recall  f1-score   support

           0       0.32      0.86      0.47        79
           1       0.98      0.75      0.85       577

    accuracy                           0.76       656
   macro avg       0.65      0.81      0.66       656
weighted avg       0.90      0.76      0.80       656

Naive Bayes test accuracy: 0.76
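# the remaining features are all 0/1, so BernoulliNB may match the data better
# than GaussianNB (a minimal sketch; bnb is a new name)
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(df_num_train, y_train)
print('BernoulliNB test accuracy: {:.2f}'.format(bnb.score(df_num_test, y_test)))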
#****************** MLP Classifier ********************
from sklearn.neural_network import MLPClassifier
mpc = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000)
mpc.fit(df_num_train, y_train)
y_predm = mpc.predict(df_num_test)
print("MLP Classifier test accuracy: ", mpc.score(df_num_test, y_test))
print(classification_report(y_test, y_predm))
MLP Classifier test accuracy:  0.9634146341463414
              precision    recall  f1-score   support

           0       0.88      0.81      0.84        79
           1       0.97      0.98      0.98       577

    accuracy                           0.96       656
   macro avg       0.93      0.90      0.91       656
weighted avg       0.96      0.96      0.96       656
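# MLPs are sensitive to feature scale; with binary features this matters less,
# but wrapping a scaler into a pipeline is a safe default (a sketch, not the
# notebook's original method; mpc_scaled is a new name)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
mpc_scaled = make_pipeline(StandardScaler(),
                           MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000))
mpc_scaled.fit(df_num_train, y_train)
print('scaled MLP test accuracy: {:.2f}'.format(mpc_scaled.score(df_num_test, y_test)))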
#********************** knn ***************************
from sklearn.neighbors import KNeighborsClassifier
k = 5
adsk = KNeighborsClassifier(n_neighbors=k)
adsk.fit(df_num_train, y_train)
y_predk = adsk.predict(df_num_test)
print("when k={} neighbors, knn test accuracy:{}".format(k, adsk.score(df_num_test, y_test)))
print("when k={} neighbors, knn train accuracy:{}".format(k, adsk.score(df_num_train, y_train)))
print(classification_report(y_test, y_predk))
# sweep k from 1 to 29 and record train/test accuracy for each value
ran = np.arange(1, 30)
train_list = []
test_list = []
for each in ran:
    adsk = KNeighborsClassifier(n_neighbors=each)
    adsk.fit(df_num_train, y_train)
    test_list.append(adsk.score(df_num_test, y_test))
    train_list.append(adsk.score(df_num_train, y_train))
print("best knn test accuracy:{} at k={}".format(np.max(test_list), test_list.index(np.max(test_list)) + 1))
print("best knn train accuracy:{} at k={}".format(np.max(train_list), train_list.index(np.max(train_list)) + 1))
plt.figure(figsize=[15, 15])
plt.plot(ran, test_list, label='test score')
plt.plot(ran, train_list, label='train score')
plt.xlabel('number of neighbors')
plt.ylabel('accuracy')
plt.xticks(ran)
plt.legend()
plt.show()
when k=5 neighbors, knn test accuracy:0.9527439024390244
when k=5 neighbors, knn train accuracy:0.9672131147540983
              precision    recall  f1-score   support

           0       0.96      0.63      0.76        79
           1       0.95      1.00      0.97       577

    accuracy                           0.95       656
   macro avg       0.96      0.81      0.87       656
weighted avg       0.95      0.95      0.95       656
best knn test accuracy:0.9573170731707317 at k=4
best knn train accuracy:0.9939001143728555 at k=1
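# caveat: picking k by test-set accuracy leaks information into model selection.
# Cross-validation on the training set avoids that (a 5-fold sketch reusing ran,
# df_num_train and y_train from above; cv_scores and best_k are new names)
from sklearn.model_selection import cross_val_score
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             df_num_train, y_train, cv=5).mean() for k in ran]
best_k = ran[int(np.argmax(cv_scores))]
print("best k by 5-fold CV:", best_k)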
#**************** Logistic Regression Classifier *****************
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
logreg = LogisticRegression()
logreg.fit(df_num_train, y_train)
y_pred = logreg.predict(df_num_test)
print(classification_report(y_test, y_pred))
print('Accuracy of logistic regression classifier on test set:{:.2f}'.format(logreg.score(df_num_test, y_test)))
              precision    recall  f1-score   support

           0       0.98      0.80      0.88        79
           1       0.97      1.00      0.99       577

    accuracy                           0.97       656
   macro avg       0.98      0.90      0.93       656
weighted avg       0.97      0.97      0.97       656
Accuracy of logistic regression classifier on test set:0.97
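# side-by-side test accuracies of the models fitted above (a sketch; assumes
# dtc, gnb, mpc, adsk and logreg are still in scope from the earlier cells,
# with adsk holding the last k from the sweep)
for name, model in [('decision tree', dtc), ('naive bayes', gnb), ('mlp', mpc),
                    ('knn', adsk), ('logistic regression', logreg)]:
    print('{}: {:.2f}'.format(name, model.score(df_num_test, y_test)))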