##
## make_graphs.py
##
## authors: Pedro Conceicao, Dejan Govc, Jānis Lazovskis
## affiliation: Neuro-Topology Group, Institute of Mathematics, University of Aberdeen
## date: June 2020
##
## description: Commands to build graphs for paper
## usage: Change the value of 'graph' at the bottom of this file, then run 'python make_graphs.py'
## comment: Needs at least Python 3.7
##

import numpy as np
import pandas as pd
import networkx as nx            # For creating some graphs
import subprocess                # For getting online files
import h5py                      # For reading online files
from scipy.sparse import coo_matrix, load_npz, save_npz, triu
from functools import reduce     # For combining submatrices

##
## Barabási-Albert (undirected) graphs
##
def ba_undirected_graph(n,m):
    # Parameters: n number of vertices, m number of edges added at each step
    # Build matrix, truncating it to upper triangular to avoid undirected edges
    # being represented as pairs of reciprocal edges.
    graph = nx.barabasi_albert_graph(n,m,seed=None)
    A = nx.to_scipy_sparse_matrix(graph,format='coo')
    A = triu(A)
    save_npz("graphs/ba-" + str(m) + ".npz", A, compressed=True)
    print('Graph saved at graphs/ba-'+str(m)+'.npz', flush=True)

##
## Barabási-Albert (directed) graphs
## take the undirected BA graph and orient edges randomly
##
def ba_directed_graph(n,m):
    # Set parameters: n number of vertices, m number of edges added at each step;
    # p0 probability of deleting the edge; p1, p2 are probabilities of orienting the edge in one of the two ways;
    # 1 - p0 - p1 - p2 is the probability of "orienting the edge in both ways", i.e. replacing it by a pair of reciprocal edges.
    p0 = 0
    p1 = 0.45
    p2 = 0.45
    # Build matrix, truncating it to upper triangular to avoid undirected edges being represented as pairs of reciprocal edges.
    # Then, randomly for each edge, either remove it or orient it in one or two ways depending on p0, p1 and p2.
    graph = nx.barabasi_albert_graph(n,m,seed=None)
    A = nx.to_scipy_sparse_matrix(graph,format='coo')
    A = triu(A)
    rmlist = []
    relist = []
    for i in range(len(A.row)):
        x = np.random.uniform()
        if x < p0:
            rmlist.append(i)
        elif x < p0 + p1:
            pass
        elif x < p0 + p1 + p2:
            r = A.row[i]
            A.row[i] = A.col[i]
            A.col[i] = r
        else:
            relist.append(i)
    A.row = np.append(A.row,A.col[relist])
    A.col = np.append(A.col,A.row[relist])
    A.data = np.append(A.data,A.data[relist])
    A.row = np.delete(A.row,rmlist)
    A.col = np.delete(A.col,rmlist)
    A.data = np.delete(A.data,rmlist)
    save_npz("graphs/dba-" + str(m) + ".npz", A, compressed=True)
    print('Graph saved at graphs/dba-'+str(m)+'.npz', flush=True)
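##
## Optional helper (an added sketch, not part of the original pipeline): load one of the
## .npz adjacency matrices written by the functions in this script and report its vertex
## and edge counts, e.g. to inspect the result of the BA constructions above.
## Uses only load_npz, which is already imported.
##
def describe_graph(path):
    # path: any file written by this script, e.g. 'graphs/ba-5.npz' or 'graphs/dba-5.npz'
    A = load_npz(path).tocoo()
    print(path + ': ' + str(A.shape[0]) + ' vertices, ' + str(A.nnz) + ' edges', flush=True)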
##
## Block graphs
##
def block_graph(b,A,i):
    # Set parameters: b is the number of vertices in a block, A is a (square) connection
    # probability matrix, i is an index to output different files
    # Build matrices of corresponding block graphs.
    n = len(A)
    B = np.zeros([n*b,n*b],dtype=int)
    for s in range(n):
        for t in range(n):
            for k in range(b):
                for l in range(b):
                    if np.random.uniform() < A[s,t] and s*b+k != t*b+l:
                        B[s*b+k,t*b+l] = 1
    B = coo_matrix(B)
    save_npz("graphs/block" + str(i) + ".npz", B, compressed=True)
    print('Graph saved at graphs/block'+str(i)+'.npz', flush=True)

##
## BlueBrain microcircuit
## https://bbp.epfl.ch/nmc-portal/web/guest/downloads
##
def bb_subgraphs(layer=0):
    # Download and extract file
    try:
        mc2 = h5py.File('cons_locs_pathways_mc2_Column.h5','r')
    except:
        download = subprocess.Popen(['wget','https://bbp.epfl.ch/public/nmc-models/pathways_stripped_prod/average/cons_locs_pathways_mc2_Column.tar'],stdout=subprocess.DEVNULL)
        download.wait()
        extract = subprocess.Popen(['tar','-xvf','cons_locs_pathways_mc2_Column.tar','cons_locs_pathways_mc2_Column.h5'],stdout=subprocess.DEVNULL)
        extract.wait()
        mc2 = h5py.File('cons_locs_pathways_mc2_Column.h5','r')
    # Save chosen layer
    if layer != 0:
        sublayers = []
        for layer_from in [l for l in list(mc2['connectivity']) if l[1]==str(layer)]:
            subsublayers = []
            for layer_to in [l for l in list(mc2['connectivity']) if l[1]==str(layer)]:
                subsublayers.append(np.array(list(mc2['connectivity'][layer_from][layer_to]['cMat'])))
            sublayers.append(reduce((lambda x,y: np.concatenate((x,y),axis=1)),subsublayers))
        matrix_dense = reduce((lambda x,y: np.concatenate((x,y))), sublayers)
        matrix_sparse = np.nonzero(matrix_dense)
        save_npz('graphs/mc2_layer'+str(layer)+'.npz', coo_matrix((np.ones(len(matrix_sparse[0])), matrix_sparse), shape=(len(matrix_dense),len(matrix_dense))))
        print('Graph saved at graphs/mc2_layer'+str(layer)+'.npz', flush=True)
    # Save complete circuit
    else:
        sublayers = []
        for layer_from in list(mc2['connectivity']):
            subsublayers = []
            for layer_to in list(mc2['connectivity']):
                subsublayers.append(np.array(list(mc2['connectivity'][layer_from][layer_to]['cMat'])))
            sublayers.append(reduce((lambda x,y: np.concatenate((x,y),axis=1)),subsublayers))
        # Stack the block-rows vertically (axis 0), matching the single-layer branch above
        matrix_dense = reduce((lambda x,y: np.concatenate((x,y))), sublayers)
        matrix_sparse = np.nonzero(matrix_dense)
        save_npz('graphs/mc2.npz', coo_matrix((np.ones(len(matrix_sparse[0])), matrix_sparse), shape=(len(matrix_dense),len(matrix_dense))))
        print('Graph saved at graphs/mc2.npz', flush=True)

##
## Erdős-Rényi graphs
##
def er_graphs(p):
    # Create the graph
    G = nx.erdos_renyi_graph(500, p, seed=None, directed=True)
    # Convert it to COO format
    M = nx.to_scipy_sparse_matrix(G, nodelist=None, dtype=np.intc, weight=None, format='coo')
    # Save it in npz format
    save_npz('graphs/er-' + str(p).replace('.','') + '.npz', M, compressed=True)
    # Confirm that the procedure worked and pay tribute to George Boole :)
    print('Graph saved at graphs/er-' + str(p).replace('.','') + '.npz. Thanks George Boole', flush=True)
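##
## Optional helper (an added sketch, not in the original script): build an n-by-n
## connection probability matrix with one probability inside blocks and another between
## blocks, in the format expected by block_graph above. The matrices actually used in the
## paper are the hard-coded p0,...,p4 in the __main__ block at the bottom of this file.
##
def uniform_block_probabilities(n, p_within, p_between):
    # n: number of blocks; p_within / p_between: edge probabilities on / off the diagonal
    P = np.full((n,n), p_between, dtype=float)
    np.fill_diagonal(P, p_within)
    return P
# Example call (hypothetical output index 99): block_graph(100, uniform_block_probabilities(3, 0.2, 0.05), 99)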
##
## ``Real-world'' graphs
## http://konect.uni-koblenz.de/networks/
## http://konect.cc/
## Google+ database
def realworld_graphs():
    # Set dictionary for graph names
    graphs = {'congress':'convote', 'dblp':'dblp-cite', 'gnutella':'p2p-Gnutella09', 'gplus':'ego-gplus', 'wiki':'wiki_talk_br'}
    sizes = {'congress':219, 'dblp':12590, 'gnutella':8114, 'gplus':23628, 'wiki':1181}
    # Download and extract files
    for g in graphs.keys():
        print('Making '+g+' graph',flush=True)
        try:
            f = open('out.'+graphs[g],'r')
        except:
            print('Downloading...',flush=True)
            download = subprocess.Popen(['wget', 'http://konect.cc/files/download.tsv.'+graphs[g]+'.tar.bz2'],stdout=subprocess.DEVNULL)
            download.wait()
            print('Extracting...',flush=True)
            extract = subprocess.Popen(['tar','--strip-components=1','-xvjf','download.tsv.'+graphs[g]+'.tar.bz2',graphs[g]+'/out.'+graphs[g]],stdout=subprocess.DEVNULL)
            extract.wait()
            f = open('out.'+graphs[g],'r')
        # Read file
        L = f.readlines()
        f.close()
        while L[0][0] == '%':
            L = L[1:]
        # Assemble rows and columns
        rows = []
        cols = []
        edges = []
        for edge in L[:-1]:
            vertices = [int(x)-1 for x in edge.split()[:2]]
            if vertices[0] != vertices[1]:
                if vertices not in edges:
                    rows.append(vertices[0])
                    cols.append(vertices[1])
                    edges.append(vertices)
        save_npz('graphs/'+g+'.npz', coo_matrix((np.ones(len(rows),dtype='int64'), (np.array(rows),np.array(cols))), shape=(sizes[g],sizes[g])))
        print('Graph saved at graphs/'+g+'.npz', flush=True)
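##
## Optional check (an added sketch, not in the original script): verify that a saved
## real-world graph matches the vertex count hard-coded in realworld_graphs above.
##
def check_realworld_size(name, expected):
    # name: a key of the 'graphs' dictionary above, e.g. 'congress'; expected: the value in 'sizes'
    A = load_npz('graphs/' + name + '.npz')
    if A.shape[0] == expected:
        print(name + ': size matches (' + str(expected) + ' vertices)', flush=True)
    else:
        print('Warning: ' + name + ' has shape ' + str(A.shape) + ', expected ' + str(expected), flush=True)
# Example call: check_realworld_size('congress', 219)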
##
## Scale-free graphs
##
def scalefree_graph(N,a,c,di,do):
    # a: probability of adding an edge and an in-vertex
    # c: probability of adding an edge and an out-vertex
    # b = 1-a-c probability of adding an edge between existing vertices
    # di, do: non-negative reals
    # N: number of steps.
    # Build matrix as described in "Directed Scale-Free Graphs" paper.
    # The initial graph is a directed 3-cycle.
    # Once the graph is generated, we remove loops and multiple edges.
    b = 1-a-c
    rows = np.array([0,1,2])
    cols = np.array([1,2,0])
    n = 3
    for t in range(3,N):
        outdeg = np.bincount(rows,minlength=n)
        indeg = np.bincount(cols,minlength=n)
        outprob = [(d+do)/(t+do*n) for d in outdeg]
        inprob = [(d+di)/(t+di*n) for d in indeg]
        x = np.random.uniform()
        yo = np.random.uniform()
        yi = np.random.uniform()
        so = 0
        si = 0
        v0 = n
        w0 = n
        for v in range(n):
            so += outprob[v]
            if yo < so:
                v0 = v
                break
        for w in range(n):
            si += inprob[w]
            if yi < si:
                w0 = w
                break
        if x < a:
            rows = np.append(rows,n)
            cols = np.append(cols,w0)
            n += 1
        elif x < a + b:
            rows = np.append(rows,v0)
            cols = np.append(cols,w0)
        else:
            rows = np.append(rows,v0)
            cols = np.append(cols,n)
            n += 1
    A = np.zeros([n,n],dtype=int)
    for t in range(N):
        if rows[t] != cols[t]:
            A[rows[t],cols[t]] = 1
    A = coo_matrix(A)
    # Note: every call writes to the same file, so a later call overwrites an earlier one.
    save_npz("graphs/dsf-graph.npz", A, compressed=True)
    print('Graph saved at graphs/dsf-graph.npz', flush=True)

##
## Generate the desired graphs
##
if __name__=="__main__":
    graph = 'realworld'

    # Create new folder for graphs if it doesn't exist
    subprocess.run(['mkdir','-p','graphs'],stderr=subprocess.DEVNULL)

    # Undirected Barabási-Albert graphs
    if graph == 'ba_undirected':
        for edges_per_step in [5,10,15,20,25]:
            ba_undirected_graph(500,edges_per_step)

    # Directed Barabási-Albert graphs
    elif graph == 'ba_directed':
        for edges_per_step in [5,10,15,20,25]:
            ba_directed_graph(500,edges_per_step)

    # Block graphs
    elif graph == 'block_graph':
        p0 = [[0.1,0.1,0.1],[0.1,0.1,0.1],[0.1,0.1,0.1]]
        p1 = [[0,0.1,0.2],[0.1,0,0.1],[0.2,0.1,0]]
        p2 = [[0.2,0.1,0],[0.1,0.2,0.1],[0,0.1,0.2]]
        p3 = [[0.2,0.1,0],[0.2,0.1,0],[0.2,0.1,0]]
        p4 = [[0,0.2,0],[0,0,0.2],[0.2,0,0]]
        for index,conn_prob in enumerate([np.array(p) for p in [p0,p1,p2,p3,p4]]):
            block_graph(100,conn_prob,index)

    # BlueBrain MC2
    elif graph == 'bluebrain':
        for bblayer in range(7):
            bb_subgraphs(bblayer)

    # Erdős-Rényi graphs
    elif graph == 'erdosrenyi':
        for prob in [0.01, 0.025, 0.05, 0.075, 0.1]:
            er_graphs(prob)

    # Real-world graphs from the KONECT collection
    elif graph == 'realworld':
        realworld_graphs()

    # Directed scale-free graphs
    elif graph == 'scalefree':
        scalefree_graph(5000, 0.05, 0.05, 0.1, 0.2)
        scalefree_graph(10000, 0.025, 0.025, 0.2, 0.4)
        scalefree_graph(15000, 0.0167, 0.0167, 0.4, 0.8)
        scalefree_graph(20000, 0.0125, 0.0125, 0.8, 1.6)
        scalefree_graph(25000, 0.01, 0.01, 1.6, 3.2)

    else:
        print('Given string "'+graph+'" not recognized. See end of file for acceptable options.',flush=True)
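##
## Acceptable values for 'graph' above (added summary of the options handled in the
## if/elif chain): 'ba_undirected', 'ba_directed', 'block_graph', 'bluebrain',
## 'erdosrenyi', 'realworld', 'scalefree'.
##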