Source code for gklearn.kernels.marginalizedKernel

"""
@author: linlin

@references:

	[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between 
	labeled graphs. In Proceedings of the 20th International Conference on 
	Machine Learning, Washington, DC, United States, 2003.

	[2] P. Mahé, N. Ueda, T. Akutsu, J.-L. Perret, and J.-P. Vert. 
	Extensions of marginalized graph kernels. In Proceedings of the 21st 
	International Conference on Machine Learning, page 70. ACM, 2004.
"""

import sys
import time
from functools import partial
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
tqdm.monitor_interval = 0

import networkx as nx
import numpy as np

from gklearn.utils.kernels import deltakernel
from gklearn.utils.utils import untotterTransformation
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm


def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Compute marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed, as an alternative
        to Gn.

    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.

    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.

    p_quit : float
        The termination probability in the random walk generating step.

    n_iteration : integer
        Number of iterations used to compute R_inf.

    remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.

    n_jobs : int
        Number of jobs for parallelization.

    chunksize : int
        Chunk size for parallel iteration; derived from the dataset size
        if None.

    verbose : boolean
        Whether to print progress and timing information.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between two graphs.

    run_time : float
        Time in seconds spent building the kernel matrix.
    """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallelize and track progress. ----
        if n_jobs is None:  # guard against None when deriving the chunk size
            n_jobs = cpu_count()
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(untotter_partial, range(0, len(Gn)),
                                    chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- compute kernels in parallel (via parallel_gm) and track progress. ----
    def init_worker(gn_toshare):
        # share the graph list with worker processes through a module-level
        # global, so it is transferred only once per worker
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
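
# A minimal usage sketch (hypothetical toy graphs; the node and edge
# attribute names follow the 'atom'/'bond_type' defaults above):
#
#     import networkx as nx
#
#     g1 = nx.Graph()
#     g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
#     g1.add_edge(0, 1, bond_type='1')
#     g2 = nx.Graph()
#     g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
#     g2.add_edge(0, 1, bond_type='1')
#
#     Kmatrix, run_time = marginalizedkernel([g1, g2], p_quit=0.5,
#                                            n_iteration=20, n_jobs=1)
#     # Kmatrix is a symmetric 2x2 matrix of pairwise kernel values.
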
def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit,
                           n_iteration):
    """Compute the marginalized graph kernel between two graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    p_quit : float
        The termination probability in the random walk generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.

    Returns
    -------
    kernel : float
        Marginalized kernel between the two graphs.
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random walk generating
    # step (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2

    # probability that both random walks stop at the current pair of nodes
    q = p_quit * p_quit
    r1 = q

    R_inf = {}  # dict to save all the R_inf for all pairs of nodes

    # initial R_inf, the 1st iteration.
    for node1 in g1.nodes():
        for node2 in g2.nodes():
            if len(g1[node1]) > 0:
                if len(g2[node2]) > 0:
                    # both nodes have neighbors: both walks may continue
                    R_inf[(node1, node2)] = r1
                else:
                    # only the walk on g2 must stop
                    R_inf[(node1, node2)] = p_quit
            else:
                if len(g2[node2]) > 0:
                    # only the walk on g1 must stop
                    R_inf[(node1, node2)] = p_quit
                else:
                    # both walks must stop
                    R_inf[(node1, node2)] = 1

    # compute all transition probabilities first.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walk
            # generating step (uniform distribution over the vertices
            # adjacent to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])

    # compute R_inf with a simple iterative method
    for i in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()

        # compute R_inf for each pair of nodes
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)]
                                     * R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and compute the kernel.
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
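
# Worked example for _marginalizedkernel_do (hypothetical toy input): let g1
# and g2 each be a single edge whose two end nodes carry the same label, and
# take p_quit = 0.5 with n_iteration = 1. Every node then has a neighbor, so
# the first iteration sets R_inf[(v1, v2)] = p_quit * p_quit = 0.25 for all
# four node pairs. With p_init = 1/2 in both graphs and deltakernel = 1 for
# every pair, ref [1] equation (6) gives
#
#     kernel = 4 * (1/2) * (1/2) * 0.25 = 0.25
#
# Further iterations add the neighbor terms of equation (8), so the value
# grows toward the converged kernel.
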
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    i = itr[0]
    j = itr[1]
    return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label,
                                        edge_label, p_quit, n_iteration)
def wrapper_untotter(Gn, node_label, edge_label, i):
    return i, untotterTransformation(Gn[i], node_label, edge_label)
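
# The two wrappers above follow the shared-global pattern expected by
# parallel_gm: an initializer publishes the graph list once per worker
# process, and each task then receives only an index pair. A minimal sketch
# of that pattern (illustrative only, not the library's own scheduling code;
# Gn, Kmatrix and the label names are assumed to exist):
#
#     from functools import partial
#     from multiprocessing import Pool
#
#     def _init_worker(gn):
#         global G_gn
#         G_gn = gn
#
#     pairs = [(i, j) for i in range(len(Gn)) for j in range(i, len(Gn))]
#     with Pool(2, initializer=_init_worker, initargs=(Gn,)) as pool:
#         do = partial(wrapper_marg_do, 'atom', 'bond_type', 0.5, 20)
#         for i, j, k in pool.imap_unordered(do, pairs):
#             Kmatrix[i][j] = Kmatrix[j][i] = k
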