"""
@author: linlin
@references:
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
Jean-Philippe Vert. Extensions of marginalized graph kernels. In
Proceedings of the twenty-first international conference on Machine
learning, page 70. ACM, 2004.
"""
import sys
import time
from functools import partial
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
tqdm.monitor_interval = 0
import networkx as nx
import numpy as np
from gklearn.utils.kernels import deltakernel
from gklearn.utils.utils import untotterTransformation
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm


def marginalizedkernel(*args,
node_label='atom',
edge_label='bond_type',
p_quit=0.5,
n_iteration=20,
remove_totters=False,
n_jobs=None,
chunksize=None,
verbose=True):
"""Compute marginalized graph kernels between graphs.
Parameters
----------
    Gn : List of NetworkX graphs
List of graphs between which the kernels are computed.
G1, G2 : NetworkX graphs
Two graphs between which the kernel is computed.
node_label : string
Node attribute used as symbolic label. The default node label is 'atom'.
edge_label : string
Edge attribute used as symbolic label. The default edge label is 'bond_type'.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.
remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.
    n_jobs : int
        Number of jobs for parallelization.
    chunksize : int
        Chunk size for parallelization. If None, a heuristic value is chosen
        based on the dataset size and the number of jobs.
    verbose : boolean
        Whether to print progress and timing information. The default value
        is True.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    run_time : float
        Time (in seconds) spent building the kernel matrix.
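
    Examples
    --------
    A minimal usage sketch, assuming two tiny hand-labeled graphs whose
    attribute names match the defaults above ('atom' on nodes, 'bond_type'
    on edges); it is illustrative rather than a strict doctest:

    >>> import networkx as nx
    >>> g1 = nx.Graph()
    >>> g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    >>> g1.add_edge(0, 1, bond_type='1')
    >>> g2 = g1.copy()
    >>> Kmatrix, run_time = marginalizedkernel([g1, g2], p_quit=0.5,
    ...                                        n_iteration=20, n_jobs=1,
    ...                                        verbose=False)
    >>> Kmatrix.shape
    (2, 2)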
"""
# pre-process
n_iteration = int(n_iteration)
Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
Gn = [g.copy() for g in Gn]
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
node_label=node_label, edge_label=edge_label)
if not ds_attrs['node_labeled'] or node_label is None:
node_label = 'atom'
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled'] or edge_label is None:
edge_label = 'bond_type'
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')
start_time = time.time()
if remove_totters:
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        if n_jobs is None:
            n_jobs = cpu_count()
        pool = Pool(n_jobs)
untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
if chunksize is None:
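            # heuristic: split the tasks evenly over the jobs for small
            # datasets; cap the chunk size at 100 for large ones.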
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
for i, g in tqdm(
pool.imap_unordered(
untotter_partial, range(0, len(Gn)), chunksize),
desc='removing tottering',
file=sys.stdout):
Gn[i] = g
pool.close()
pool.join()
# # ---- direct running, normally use single CPU core. ----
# Gn = [
# untotterTransformation(G, node_label, edge_label)
# for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
# ]
Kmatrix = np.zeros((len(Gn), len(Gn)))
    # ---- use pool.imap_unordered to parallelize and track progress. ----
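    # the worker initializer publishes the shared graph list as a
    # module-level global in each subprocess, so that it is not re-pickled
    # for every task.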
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_marg_do, node_label, edge_label,
p_quit, n_iteration)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
# # ---- direct running, normally use single CPU core. ----
## pbar = tqdm(
## total=(1 + len(Gn)) * len(Gn) / 2,
## desc='Computing kernels',
## file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
## print(i, j)
# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
# edge_label, p_quit, n_iteration)
# Kmatrix[j][i] = Kmatrix[i][j]
## pbar.update(1)
run_time = time.time() - start_time
if verbose:
print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return Kmatrix, run_time


def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Compute the marginalized graph kernel between 2 graphs.

    Parameters
    ----------
G1, G2 : NetworkX graphs
2 graphs between which the kernel is computed.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.

    Return
    ------
    kernel : float
        Marginalized kernel between the 2 graphs.
"""
# init parameters
kernel = 0
num_nodes_G1 = nx.number_of_nodes(g1)
num_nodes_G2 = nx.number_of_nodes(g2)
# the initial probability distribution in the random walks generating step
    # (uniform distribution over the nodes of each graph)
p_init_G1 = 1 / num_nodes_G1
p_init_G2 = 1 / num_nodes_G2
q = p_quit * p_quit
r1 = q
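    # r1 = p_quit^2 is the probability that both random walks terminate
    # immediately; it is the base term of the R_inf recursion below.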
# # initial R_inf
# # matrix to save all the R_inf for all pairs of nodes
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
#
# # Compute R_inf with a simple iterative method
# for i in range(1, n_iteration):
# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
# R_inf_new.fill(r1)
#
# # Compute R_inf for each pair of nodes
# for node1 in g1.nodes(data=True):
# neighbor_n1 = g1[node1[0]]
# # the transition probability distribution in the random walks
# # generating step (uniform distribution over the vertices adjacent
# # to the current vertex)
# if len(neighbor_n1) > 0:
# p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
# for node2 in g2.nodes(data=True):
# neighbor_n2 = g2[node2[0]]
# if len(neighbor_n2) > 0:
# p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
#
# for neighbor1 in neighbor_n1:
# for neighbor2 in neighbor_n2:
# t = p_trans_n1 * p_trans_n2 * \
# deltakernel(g1.node[neighbor1][node_label],
# g2.node[neighbor2][node_label]) * \
# deltakernel(
# neighbor_n1[neighbor1][edge_label],
# neighbor_n2[neighbor2][edge_label])
#
# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
# neighbor2] # ref [1] equation (8)
# R_inf[:] = R_inf_new
#
# # add elements of R_inf up and compute kernel.
# for node1 in g1.nodes(data=True):
# for node2 in g2.nodes(data=True):
# s = p_init_G1 * p_init_G2 * deltakernel(
# node1[1][node_label], node2[1][node_label])
# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6)
R_inf = {} # dict to save all the R_inf for all pairs of nodes
# initial R_inf, the 1st iteration.
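    # R_1(v1, v2) = q(v1) * q(v2), where the stopping probability q(v) is
    # p_quit at a node with neighbors and 1 at an isolated node (a walk
    # stranded at a degree-0 node terminates with probability 1).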
for node1 in g1.nodes():
for node2 in g2.nodes():
if len(g1[node1]) > 0:
if len(g2[node2]) > 0:
R_inf[(node1, node2)] = r1
else:
R_inf[(node1, node2)] = p_quit
else:
if len(g2[node2]) > 0:
R_inf[(node1, node2)] = p_quit
else:
R_inf[(node1, node2)] = 1
    # compute all transition probabilities first.
t_dict = {}
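    # t_dict[(v1, v2, u1, u2)] caches the one-step transition term: the
    # product of the two uniform transition probabilities and the delta
    # kernels on the neighbor node labels and on the connecting edge labels.
    # caching avoids recomputing these terms at every iteration below.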
if n_iteration > 1:
for node1 in g1.nodes():
neighbor_n1 = g1[node1]
# the transition probability distribution in the random walks
# generating step (uniform distribution over the vertices adjacent
# to the current vertex)
if len(neighbor_n1) > 0:
p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
for node2 in g2.nodes():
neighbor_n2 = g2[node2]
if len(neighbor_n2) > 0:
p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
for neighbor1 in neighbor_n1:
for neighbor2 in neighbor_n2:
t_dict[(node1, node2, neighbor1, neighbor2)] = \
p_trans_n1 * p_trans_n2 * \
deltakernel(g1.nodes[neighbor1][node_label],
g2.nodes[neighbor2][node_label]) * \
deltakernel(
neighbor_n1[neighbor1][edge_label],
neighbor_n2[neighbor2][edge_label])
    # Compute R_inf with a simple iterative method
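    # R_i(v1, v2) = r1 + sum_{u1 in N(v1)} sum_{u2 in N(v2)}
    #     t(v1, v2, u1, u2) * R_{i-1}(u1, u2)    (ref [1], equation (8))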
for i in range(2, n_iteration + 1):
R_inf_old = R_inf.copy()
# Compute R_inf for each pair of nodes
for node1 in g1.nodes():
neighbor_n1 = g1[node1]
# the transition probability distribution in the random walks
# generating step (uniform distribution over the vertices adjacent
# to the current vertex)
if len(neighbor_n1) > 0:
for node2 in g2.nodes():
neighbor_n2 = g2[node2]
if len(neighbor_n2) > 0:
R_inf[(node1, node2)] = r1
for neighbor1 in neighbor_n1:
for neighbor2 in neighbor_n2:
R_inf[(node1, node2)] += \
(t_dict[(node1, node2, neighbor1, neighbor2)] * \
R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8)
    # sum up the elements of R_inf to compute the kernel.
for (n1, n2), value in R_inf.items():
s = p_init_G1 * p_init_G2 * deltakernel(
g1.nodes[n1][node_label], g2.nodes[n2][node_label])
kernel += s * value # ref [1] equation (6)
return kernel


def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label,
                                        edge_label, p_quit, n_iteration)


def wrapper_untotter(Gn, node_label, edge_label, i):
return i, untotterTransformation(Gn[i], node_label, edge_label)