The Wikipedia Clickstream dataset contains counts of (referer, resource) pairs extracted from the request logs of Wikipedia. A referer is an HTTP header field that identifies the address of the webpage that linked to the resource being requested. The data shows how people get to a Wikipedia article and what links they click on. Documentation is available on the Wikimedia "Research:Wikipedia clickstream" page.
This exploration takes the .tsv for English Wikipedia from March 2020, which can be downloaded from the Wikimedia clickstream dumps site (dumps.wikimedia.org/other/clickstream/). Dumps for other languages are also available.
import pandas as pd
import re
# Load the clickstream dump: one (referer, resource, link-type, count) row per line.
# The file has no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    '../clickstream-enwiki-2020-03.tsv',
    sep='\t',
    header=None,
    names=['prev', 'curr', 'type', 'n'],
    usecols=[0, 1, 2, 3],
)
df.head()
# Top 5 most-requested articles overall, by total click count.
df.groupby('curr').sum().sort_values('n', ascending=False).head(5)
# Top 5 referers overall, by total click count.
df.groupby('prev').sum().sort_values('n', ascending=False).head(5)
# Clicks that left the English Wikipedia Main Page, largest first.
outgoingWPMain = df[df['prev'] == 'Main_Page']
outgoingWPMain.sort_values('n', ascending=False).head(5)
# All traffic touching the main pandemic article, as referer or as request.
pandemicArticle = '2019–20_coronavirus_pandemic'
coronaDf = df[(df['prev'] == pandemicArticle) | (df['curr'] == pandemicArticle)]
coronaDf.sort_values('n', ascending=False)
# Export the in/out traffic of the pandemic article as TSV for visualisation.
# Work on a copy: the original aliased coronaDf, so renaming the columns
# mutated coronaDf in place and had to be undone later.
exportCov = coronaDf.copy()
exportCov.columns = ["source", "target", "type", "value"]
exportCov = exportCov.sort_values('value', ascending=False)

# Mark every target that also appears as a source (except the pandemic
# article itself) with a trailing " *".  A one-time set lookup replaces the
# original per-row re-filtering of the whole frame, which was O(n^2).
allSources = set(exportCov['source'])
targetIsAlsoSource = exportCov['target'].isin(allSources) & (
    exportCov['target'] != '2019–20_coronavirus_pandemic'
)
exportCov.loc[targetIsAlsoSource, 'target'] = exportCov['target'] + " *"

exportCov.to_csv("InOut_2019–20_coronavirus_pandemic.tsv", index=False, sep="\t")
exportCov[:100].to_csv("InOutTop100_2019–20_coronavirus_pandemic.tsv", index=False, sep="\t")
exportCov
# Ensure coronaDf carries the original column names (the export step may
# have renamed them on a shared object).
coronaDf.columns = ["prev", "curr", "type", "n"]

# Top 10 referers leading INTO the pandemic article.
incomingMain = coronaDf[coronaDf['curr'] == '2019–20_coronavirus_pandemic']
incomingMain.groupby('prev').sum().sort_values('n', ascending=False).head(10)

# Top 10 articles requested FROM the pandemic article.
outgoingMain = df[df['prev'] == '2019–20_coronavirus_pandemic']
outgoingMain.groupby('curr').sum().sort_values('n', ascending=False).head(10)

incomingMain.sum()
outgoingMain.sort_values('n', ascending=False)

# Restrict outgoing traffic to clicks on actual article links.
outgoingMainLinks = outgoingMain[outgoingMain['type'] == 'link']
outgoingMainLinks.sort_values('n', ascending=False)
Type "link" means the main pandemic article contains a link to the requested article (i.e. the click followed a link inside the article); type "other" could mean a search, but could also be an incorrect referer.
# Outgoing click counts broken down by link type.
outgoingMain.groupby('type').sum()

# Rows classified as 'other' (searches, or possibly incorrect referers),
# largest first.
coronaMainSearch = coronaDf[coronaDf['type'] == 'other']
coronaMainSearch.sort_values('n', ascending=False)