%load_ext watermark
%watermark -v -u -n -t -z -a 'Samira Kumar' -p numpy,pandas,networkx,scipy,matplotlib,statsmodels
import pandas as pd
import numpy as np
df=pd.read_csv('products.csv') # product metadata: id, title, group, salesrank, review_cnt, downloads, rating
df.head()
df_filter=df[(df['group']=='Book')&((df['salesrank']<=150000)&(df['salesrank']>-1))]
df_cop=pd.read_csv('copurchase.csv') # co-purchase edges: Source, Target
df_cop_book=df_cop[df_cop.Source.isin(df_filter.id) & df_cop.Target.isin(df_filter.id)]
in_degree=df_cop_book.groupby(['Target'])['Source'].size().reset_index(name='in_degree')
out_degree=df_cop_book.groupby(['Source'])['Target'].size().reset_index(name='out_degree')
x = out_degree.set_index('Source')
y = in_degree.set_index('Target').rename_axis('Source')
y.columns = x.columns
combined=y.add(x, fill_value=0).loc[y.index, :].reset_index()
df_filter.head()
combined.nlargest(5,'out_degree') # Top 5 products by total (in + out) degree; the combined total is stored in the 'out_degree' column
# Products 33 and 4429 have the highest total degree.
#grouping by source and target products to see if any particular combination has been repeated.
#We'll use this dataset to build the network graph
df_final_group=df_cop_book.groupby(['Source','Target']).size().reset_index(name='Freq')
df_final_group.head()
The products with the highest total degree (in-degree + out-degree) are 33 and 4429. We are interested in the sub-component of all products that are directly or indirectly associated with products 33 and 4429. The nodes 33 and 4429 and everything connected to them were visualized with the NetworkX package. In the graph, a larger node size means a larger degree, and a darker colour also indicates a larger degree. Node degrees range from 1 to 53. (A clearer picture is attached with the assignment.)
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pylab
import community
from matplotlib.pyplot import subplots
%matplotlib inline
g = nx.from_pandas_edgelist(df_final_group, 'Source', 'Target', ['Freq'])#create_using=nx.Graph()
fig, ax = subplots()
subnet = nx.node_connected_component(g, 4429)
pos=nx.kamada_kawai_layout(g.subgraph(subnet))
cmapC = plt.cm.get_cmap('Spectral')
degrees = dict(g.subgraph(subnet).degree()) #Dict with Node ID, Degree
nodes = dict(g.subgraph(subnet).nodes())
n_color = np.asarray([degrees[n] for n in nodes])
edges = dict(g.subgraph(subnet).edges())
weights = [g.subgraph(subnet)[u][v]['Freq'] for u,v in edges]
colors=range(53)
vmin = min(colors)
vmax = max(colors)
nx.draw_kamada_kawai(g.subgraph(subnet), with_labels=False,
                     nodelist=degrees.keys(), node_size=[v*50 for v in degrees.values()],
                     cmap=cmapC, width=weights, arrows=True, node_color=n_color, vmin=vmin, vmax=vmax)
plt.xticks([], [])
plt.yticks([], [])
fig = plt.gcf()
fig.set_size_inches(50, 50)
sm = plt.cm.ScalarMappable(cmap=cmapC, norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm._A = []
cbar=plt.colorbar(sm,aspect=40)
cbar.ax.tick_params(labelsize=30)
plt.show()
Degree of the Nodes:
The histogram below shows the degree distribution of the nodes. The degree of a node is the number of nodes it is directly connected to. A large number of nodes have a low degree (< 10), while only a few nodes have a degree greater than 10.
## Degree Histogram for sub-component
import collections
import seaborn as sns
degree_sequence = sorted([d for n, d in g.subgraph(subnet).degree()], reverse=True)
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()
plt.bar(deg, cnt, width=0.8, color='r')
plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
fig = plt.gcf()
fig.set_size_inches(12,8)
plt.show()
#Density of sub-component
density_sc=nx.density(g.subgraph(subnet))
density_sc
#A low density means the sub-component is sparse: only a small fraction of all possible edges between these nodes are present.
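As a quick sanity check (not part of the original analysis), the density of an undirected graph is 2m / (n(n-1)), which we can recompute by hand and compare with nx.density:
# Manual density check for the undirected sub-component: 2m / (n(n-1))
sub = g.subgraph(subnet)
manual_density = 2 * sub.number_of_edges() / (sub.number_of_nodes() * (sub.number_of_nodes() - 1))
print(manual_density, density_sc)  # the two values should agree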
Degree centrality of a node is the fraction of the other nodes it is connected to. Since two nodes have the highest degree (53), their degree centrality is the highest, followed by the other high-degree nodes. We can also see that many nodes share a similar degree centrality.
#Centrality of the Nodes.
degree_central=nx.degree_centrality(g.subgraph(subnet))
plt.bar(range(len(degree_central)), list(degree_central.values()),width=0.8, color='r')
# plt.xticks(range(len(degree_central)), list(degree_central.keys()))
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
axes.set_ylim([0,0.06])
plt.title("Centrality of Each node")
plt.show()
#Degree centrality = fraction of the other nodes a node is connected to. Two nodes have the highest centrality.
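As a small illustration (the helper variable sub below is introduced only for this check), degree centrality is simply a node's degree divided by n - 1, so we can spot-check the highest-degree node:
# Spot-check: degree centrality = degree / (n - 1)
sub = g.subgraph(subnet)
node = max(degrees, key=degrees.get)       # one of the highest-degree nodes (33 or 4429)
print(degrees[node] / (len(sub) - 1))      # manual value
print(degree_central[node])                # networkx value; should match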
Closeness centrality is based on the average length of the shortest paths between a node and all other nodes in the graph: the more central a node is, the closer it is to all other nodes (NetworkX uses the reciprocal of the average shortest-path distance). For this particular subgraph, the closeness centrality looks similar for most of the nodes.
#Closeness Centrality of the Nodes.
close_degree_central=nx.closeness_centrality(g.subgraph(subnet))
plt.bar(range(len(close_degree_central)), list(close_degree_central.values()),width=0.8, color='r')
# plt.xticks(range(len(degree_central)), list(degree_central.keys()))
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Closeness Centrality of each node")
plt.show()
#Closeness Centrality-Closeness centrality of a node u is the reciprocal of the
#average shortest path distance to u over all n-1 reachable nodes.
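To make the definition concrete (a minimal check, not part of the original pipeline), the closeness centrality of a node u in a connected graph equals (n - 1) divided by the sum of its shortest-path distances to all other nodes:
# Manual closeness check for node 4429
sub = g.subgraph(subnet)
dist = nx.shortest_path_length(sub, source=4429)        # dict: node -> distance from 4429
print((len(sub) - 1) / sum(dist.values()))              # manual value
print(close_degree_central[4429])                       # networkx value; should match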
Betweenness centrality quantifies how often a node acts as a bridge along the shortest path between two other nodes: the betweenness centrality of a node v is the sum, over all pairs of other nodes, of the fraction of shortest paths that pass through v. In the graph below, we can see that only a few nodes act as bridges connecting the other nodes.
#Betweenness Centrality of the Nodes.
between_degree_central=nx.betweenness_centrality(g.subgraph(subnet))
plt.bar(range(len(between_degree_central)), list(between_degree_central.values()),width=0.8, color='r')
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Betweeness Centrality of each node")
plt.show()
#Betweenness centrality: compute the shortest-path betweenness centrality for nodes.
#Betweenness centrality of a node v is the sum of the fraction of all-pairs shortest paths that pass through v
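A tiny toy example (purely illustrative) makes the "bridge" interpretation concrete: in a three-node path, the middle node lies on the only shortest path between the endpoints, so its normalized betweenness is 1.
# Toy illustration of betweenness: path 0-1-2, node 1 is the only bridge
toy = nx.path_graph(3)
print(nx.betweenness_centrality(toy))    # expected: {0: 0.0, 1: 1.0, 2: 0.0}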
Eigenvector centrality computes the centrality of a node from the centrality of its neighbours. It assigns relative scores to all nodes in the network based on the idea that connections to high-scoring nodes contribute more to a node's score than equal connections to low-scoring nodes. In the subgraph, two nodes have a high eigenvector score, and many other nodes share a score of about 0.1, indicating that they are similarly positioned.
#Eigenvector Centrality of the Nodes.
Eigen_central=nx.eigenvector_centrality_numpy(g.subgraph(subnet))
plt.bar(range(len(Eigen_central)), list(Eigen_central.values()),width=0.8, color='r')
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Eigen Vector Centrality of each node")
plt.show()
#Eigenvector centrality computes the centrality for a node based on the centrality of its neighbors.
#The eigenvector centrality for node i is the i-th element of the vector x defined by the equation Ax = lambda*x,
#where A is the adjacency matrix of the graph and lambda is its largest eigenvalue.
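As a minimal sketch (on a small toy graph, not the co-purchase data), the eigenvector centrality can be recovered directly as the principal eigenvector of the adjacency matrix:
# Eigenvector centrality = principal eigenvector of A (Ax = lambda x), here on a toy path graph
toy = nx.path_graph(4)
A = nx.to_numpy_array(toy)
eigvals, eigvecs = np.linalg.eigh(A)
principal = np.abs(eigvecs[:, np.argmax(eigvals)])      # unit-norm principal eigenvector
print(dict(zip(toy.nodes(), principal)))
print(nx.eigenvector_centrality_numpy(toy))             # should agree up to rounding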
The hub score estimates a node's value based on its outgoing links, while the authority score estimates its value based on incoming links. Because the co-purchase graph was built as an undirected graph, every edge counts as both incoming and outgoing, so the hub and authority scores are identical for every node.
#Hub Score of the Nodes.
hits_score=nx.hits_numpy(g.subgraph(subnet))
plt.bar(range(len(hits_score[0])), list(hits_score[0].values()),width=0.8, color='r')
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Hub Score for each node")
plt.show()
#Hub estimates the node value based on the Outgoing links.
#Authority Score of the Nodes.
hits_score=nx.hits_numpy(g.subgraph(subnet))
plt.bar(range(len(hits_score[1])), list(hits_score[1].values()),width=0.8, color='r')
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Authority Score for each node")
plt.show()
#Authorities estimates the node value based on the incoming links.
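Because g was built as an undirected graph, every edge is simultaneously "incoming" and "outgoing", so the hub and authority vectors coincide; a one-line check (added here only as an illustration):
# Hub scores equal authority scores on an undirected graph
print(all(np.isclose(hits_score[0][n], hits_score[1][n]) for n in hits_score[0]))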
The average neighbour degree of a node is the average degree of its direct neighbours, which gives a good indication of how well-connected a node's neighbourhood is. In the graph below, nodes connected only to 33 or 4429 have an average neighbour degree of 53, but the dispersion of the average degree across nodes is high.
#Average Degree Neighbour
degree_assort=nx.average_neighbor_degree(g.subgraph(subnet))
plt.bar(range(len(degree_assort)), list(degree_assort.values()),width=0.8, color='r')
fig = plt.gcf()
axes=plt.gca()
fig.set_size_inches(15,10)
plt.title("Average degree neighbour for each node")
plt.show()
#Returns the average degree of the neighborhood of each node.
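A quick spot-check (illustrative only): the average neighbour degree of a node is simply the mean degree of its direct neighbours.
# Manual average-neighbour-degree check for node 4429
sub = g.subgraph(subnet)
print(np.mean([sub.degree(nb) for nb in sub.neighbors(4429)]))   # manual value
print(degree_assort[4429])                                       # networkx value; should match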
The diameter is the longest of all the shortest-path lengths in the network, i.e. the distance between the two most distant connected nodes. For this subgraph, the diameter is 41.
#Diameter of the network graph
nx.diameter(g.subgraph(subnet))
#The shortest distance/path length between the two most distant connected nodes in the
#network ( = the longest of all the calculated path lengths).
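Equivalently (a sanity check, and potentially slow on large components), the diameter is the maximum eccentricity over all nodes:
# Diameter = maximum eccentricity
print(max(nx.eccentricity(g.subgraph(subnet)).values()))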
We compute the overall average rating, sales rank and review count for product 4429 and the products directly co-purchased with it (the details for product 4429 itself are filled in manually below).
filter_4429=df_final_group[(df_final_group['Source']==4429)|(df_final_group['Target']==4429)]
merged=pd.merge(filter_4429,df_filter,how='left', left_on=['Source'],right_on=['id'])
# Override / fill in the details of product 4429 manually
merged.loc[merged.id==4429,'title']='The Narcissistic Family : Diagnosis and Treatment'
merged.loc[merged.id==4429,'salesrank']=9727
merged.loc[merged.id==4429,'review_cnt']=19
merged.loc[merged.id==4429,'downloads']=19
merged.loc[merged.id==4429,'rating']=5.0
merged.loc[merged.id==4429,'id']=2501
merged[['rating','salesrank','review_cnt']].mean()
Since the neighbours (source/target) of each product node can influence its purchase, we compute the sales rank, rating and review count of the neighbours of each product.
Example: if product 33 is co-purchased with 55 and 66, then both 55 and 66 influence the purchase of 33, so we sum the sales rank, reviews and rating of every product associated with 33. Products 33 and 4429 have the highest degree (53), so for them we aggregate the sales rank, rating and reviews of the 53 products associated with each. This is done for every product appearing as either source or target.
The logic is straightforward: for each source we take the sales ranks of its targets, and for each target we take the sales ranks of its sources. We then group by source and sum the target sales ranks, and group by target and sum the source sales ranks. Adding the two gives, for each product, the total sales rank of all products associated with it as either source or target.
Consider the example below:

| Source | Target | salesrank_source | salesrank_target |
|--------|--------|------------------|------------------|
| 33     | 55     | 5                | 10               |
| 33     | 66     | 5                | 15               |
| 44     | 33     | 20               | 5                |

Grouping gives the following per-product sums:

| id | Source_sum | Target_sum |
|----|------------|------------|
| 33 | 25         | 20         |
| 44 | 5          | 0          |
| 55 | 0          | 5          |
| 66 | 0          | 5          |

33 is the source for two products, so the sum of their sales ranks is 10 + 15 = 25, and 33 is the target of one product, which contributes another 20. The total sales rank of all products associated with 33 is therefore 45 (10 + 15 + 20).

To find the sales-rank sum of the neighbours of product 33, we first take the sales ranks of products 55 and 66 and store them as salesrank_target, since they sit in the target column. We then take the sales rank of 44 and store it as salesrank_source. Grouping by source and summing salesrank_target gives Source_sum; grouping by target and summing salesrank_source gives Target_sum. Adding Source_sum and Target_sum gives the total sales rank of the products associated with each product. One catch: some products appear only as a source or only as a target, so the missing side is treated as 0 when the two sums are combined into one dataframe.
The same process is followed for rating and review count.
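A minimal pandas sketch of this aggregation on the toy example above (the names toy, src_sum, tgt_sum and neighbour_total are hypothetical and not part of the pipeline that follows):
# Reproduce the Source_sum / Target_sum logic on the toy example
toy = pd.DataFrame({'Source': [33, 33, 44],
                    'Target': [55, 66, 33],
                    'salesrank_source': [5, 5, 20],
                    'salesrank_target': [10, 15, 5]})
src_sum = toy.groupby('Source')['salesrank_target'].sum().rename('Source_sum')
tgt_sum = toy.groupby('Target')['salesrank_source'].sum().rename('Target_sum')
neighbour_total = src_sum.add(tgt_sum, fill_value=0)    # products missing on one side count as 0
print(neighbour_total)    # expected: 33 -> 45, 44 -> 5, 55 -> 5, 66 -> 5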
#Get the edges of the sub-component from the network graph and store them in an edges dataframe. From the product table,
# we then pull the salesrank, rating and review details for all the products.
h=g.subgraph(subnet)
edges=pd.DataFrame(list(h.edges()))
# The two columns are labelled 0 (source node) and 1 (target node); the code below refers to them by these integer labels.
#total salesrank
df_cop_sales=pd.merge(edges,df_filter[['id','salesrank']],left_on=1,right_on='id',how='left')
df_cop_sales=df_cop_sales.rename(columns={'salesrank':'salesrank_target'})
df_cop_sale=pd.merge(edges,df_filter[['id','salesrank']],left_on=0,right_on='id',how='left')
df_cop_sale=df_cop_sale.rename(columns={'salesrank':'salesrank_source'})
df_cop_sale['salesrank_target']=df_cop_sales['salesrank_target']
df_source_sum=df_cop_sale.groupby(0)['salesrank_target'].sum().reset_index(name='Source_sum')
df_target_sum=df_cop_sale.groupby(1)['salesrank_source'].sum().reset_index(name='Target_sum')
x = df_source_sum.set_index(0)
y = df_target_sum.set_index(1).rename_axis(0)
y.columns = x.columns
combined=y.add(x, fill_value=0)   # products missing on one side contribute 0
combined=combined.rename(columns={'Source_sum':'Total_salesrank'})
combined.head()
#Total rating
df_cop_rating=pd.merge(edges,df_filter[['id','rating']],left_on=1,right_on='id',how='left')
df_cop_rating=df_cop_rating.rename(columns={'rating':'rating_target'})
df_cop_rate=pd.merge(edges,df_filter[['id','rating']],left_on=0,right_on='id',how='left')
df_cop_rate=df_cop_rate.rename(columns={'rating':'rating_source'})
df_cop_rate['rating_target']=df_cop_rating['rating_target']
df_source_rate_sum=df_cop_rate.groupby(0)['rating_target'].sum().reset_index(name='Source_sum')
df_target_rate_sum=df_cop_rate.groupby(1)['rating_source'].sum().reset_index(name='Target_sum')
x = df_source_rate_sum.set_index(0)
y = df_target_rate_sum.set_index(1).rename_axis(0)
y.columns = x.columns
combined_rating=y.add(x, fill_value=0)
combined_rating=combined_rating.rename(columns={'Source_sum':'Total_rating'})
combined_rating.head()
#Total review count
df_cop_reviews=pd.merge(edges,df_filter[['id','review_cnt']],left_on=1,right_on='id',how='left')
df_cop_reviews=df_cop_reviews.rename(columns={'review_cnt':'review_cnt_target'})
df_cop_review=pd.merge(edges,df_filter[['id','review_cnt']],left_on=0,right_on='id',how='left')
df_cop_review=df_cop_review.rename(columns={'review_cnt':'review_cnt_source'})
df_cop_review['review_cnt_target']=df_cop_reviews['review_cnt_target']
df_source_review_sum=df_cop_review.groupby(0)['review_cnt_target'].sum().reset_index(name='Source_sum')
df_target_review_sum=df_cop_review.groupby(1)['review_cnt_source'].sum().reset_index(name='Target_sum')
x = df_source_review_sum.set_index(0)
y = df_target_review_sum.set_index(1).rename_axis(0)
y.columns = x.columns
combined_reviews=y.add(x, fill_value=0)
combined_reviews=combined_reviews.rename(columns={'Source_sum':'Total_review_cnt'})
combined_reviews.head()
By dividing each total by the node's degree, we obtain the average sales rank, rating and review count of the products associated with each product.
degrees=pd.DataFrame(list(h.degree()))
degrees=degrees.sort_values(0)
degrees.head()
df_merge_sales=pd.merge(combined,degrees,left_on=0,right_on=0,how='left')
df_merge_sales['Average_Salesrank']=df_merge_sales['Total_salesrank']/df_merge_sales[1]
df_merge_sales=df_merge_sales.rename(columns={'key_0':'id',1:'Degree'})
df_merge_sales.head()
df_merge_rating=pd.merge(combined_rating,degrees,left_on=0,right_on=0,how='left')
df_merge_rating['Average_Rating']=df_merge_rating['Total_rating']/df_merge_rating[1]
df_merge_rating=df_merge_rating.rename(columns={'key_0':'id',1:'Degree'})
df_merge_rating.head()
df_merge_reviews=pd.merge(combined_reviews,degrees,left_on=0,right_on=0,how='left')
df_merge_reviews['Average_Reviews']=df_merge_reviews['Total_review_cnt']/df_merge_reviews[1]
df_merge_reviews=df_merge_reviews.rename(columns={'key_0':'id',1:'Degree'})
df_merge_reviews.head()
From the network graph, we collect each of these measures for every node and add them to the dataframe.
centrality_df=pd.DataFrame(degree_central.items(),columns=['id','centrality'])
degree_df=pd.DataFrame(dict(h.degree()).items(),columns=['id','degree']) # recompute the degree dict; the name 'degrees' now holds a DataFrame
closeness_central_df=pd.DataFrame(close_degree_central.items(),columns=['id','closeness_centrality'])
between_degree_df=pd.DataFrame(between_degree_central.items(),columns=['id','between_centrality'])
Eigen_central_df=pd.DataFrame(Eigen_central.items(),columns=['id','eigen_centrality'])
hubs_score_df=pd.DataFrame(hits_score[0].items(),columns=['id','hub_score'])
authority_score_df=pd.DataFrame(hits_score[1].items(),columns=['id','authority_score'])
avg_degree_neighbour_df=pd.DataFrame(degree_assort.items(),columns=['id','avg_degree_neighbour'])
centrality_df.head()
#filter purchase dataset
df_filter_books=df_filter[(df_filter.id.isin(centrality_df.id)) & (df_filter.id.isin(degree_df.id))
&(df_filter.id.isin(between_degree_df.id))&(df_filter.id.isin(Eigen_central_df.id))
&(df_filter.id.isin(hubs_score_df.id))&(df_filter.id.isin(authority_score_df.id))
&(df_filter.id.isin(avg_degree_neighbour_df.id))]
df_filter_books.head()
df_filter_books=pd.merge(df_filter_books,centrality_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,degree_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,closeness_central_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,between_degree_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,Eigen_central_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,hubs_score_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,authority_score_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,avg_degree_neighbour_df,left_on='id',right_on='id',how='left')
df_filter_books=pd.merge(df_filter_books,df_merge_sales[['key_0','Average_Salesrank']],left_on='id',right_on='key_0',how='left')
df_filter_books=pd.merge(df_filter_books,df_merge_rating[['key_0','Average_Rating']],left_on='id',right_on='key_0',how='left')
df_filter_books=pd.merge(df_filter_books,df_merge_reviews[['key_0','Average_Reviews']],left_on='id',right_on='key_0',how='left')
df_filter_books=df_filter_books.drop(['key_0_x','key_0_y','key_0'],axis=1)
df_filter_books.head()
df_log=df_filter_books.copy()
# log(1 + x) transform of the numeric predictors
log_cols=['review_cnt','downloads','rating','centrality','degree','closeness_centrality',
          'between_centrality','eigen_centrality','hub_score','authority_score',
          'avg_degree_neighbour','Average_Salesrank','Average_Rating','Average_Reviews']
for col in log_cols:
    df_log[col]=np.log(df_log[col]+1)
df_log.tail()
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Poisson,Binomial
X = df_log[['review_cnt','downloads','rating','centrality','between_centrality','closeness_centrality','eigen_centrality','authority_score'
,'avg_degree_neighbour','Average_Salesrank','Average_Rating','Average_Reviews']]
X = sm.add_constant(X)
y = df_log.salesrank
poisson_model = smf.poisson('salesrank ~ review_cnt+downloads+ rating+centrality+between_centrality+closeness_centrality+eigen_centrality+authority_score + avg_degree_neighbour+ Average_Salesrank+Average_Rating+Average_Reviews', df_log)
res=poisson_model.fit(method='bfgs')
res.summary()
y_pred = res.predict(X)
from sklearn.metrics import mean_squared_error,mean_absolute_error
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(mse, mae)
print('Parameters: ', res.params)
print('T-values: ', res.tvalues)
from sklearn.metrics import accuracy_score,r2_score
errors = abs(y_pred - y)
print('Variance score: %.2f' % r2_score(y, y_pred))
print('Mean Absolute Error:', round(np.mean(errors), 2), 'salesrank.')
mape = 100 * (errors / y)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
y_pred[0:5],y[0:5]
fig=plt.gcf()
fig.set_size_inches(10,8)
# plt.scatter(y,y_pred)
# plt.title('Train dataset Real vs. Predicted Values')
# plt.show()
import seaborn as sns
sns.regplot(y,y_pred)
plt.show()
fig=plt.gcf()
fig.set_size_inches(10,8)
X = sm.add_constant(X)
model = sm.GLM(y, X, family=sm.families.Poisson()).fit()
sns.regplot(df_log['salesrank'], model.resid_deviance, fit_reg=False)
plt.title('Residual plot')
plt.xlabel('Salesrank')
plt.ylabel('Residuals')
The intercept is meaningful here, since a product can be purchased on its own and can have a sales rank even if it has no co-purchased product. The p-values of all the variables are below 0.05, indicating that they are significant in the model; as seen in the R model, the variables used are significant in predicting the sales rank. The model was initially built with a large number of variables, and the insignificant variables were later dropped. The authority score has the largest coefficient, indicating that an increase in authority score is associated with an increase in sales rank. Since the authority score is based on incoming links, products purchased as the target of a co-purchase tend to have a higher sales rank than products purchased as the source. The final model's mean absolute deviation is about 210.55 and the mean of the residuals is 28885, indicating a good fit of the parameters.