DeepWalk Source Code Walkthrough 6: graph.py

Copyright notice: this is an original article by the blogger and may not be reproduced without permission. https://blog.csdn.net/github_36326955/article/details/82702379

1. collections.defaultdict

  1. Using collections.defaultdict() in Python
  2. collections — Container datatypes

2. Source code walkthrough
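
Before reading the code, note that the Graph class below is simply a defaultdict whose default_factory is list: looking up a missing key creates an empty list on the fly, which is exactly why the loaders further down can call G[x].append(y) without initializing anything. A minimal sketch of that behaviour (not part of graph.py):

from collections import defaultdict

adj = defaultdict(list)   # every missing key maps to a fresh empty list
adj[1].append(2)          # no KeyError: adj[1] is created as [] first
adj[1].append(3)
adj[2].append(1)
print(adj)                # defaultdict(<class 'list'>, {1: [2, 3], 2: [1]})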


#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Graph utilities."""

import logging

from io import open
from time import time
from six.moves import range, zip, zip_longest
from six import iterkeys
from collections import defaultdict
from collections.abc import Iterable  # the original imports Iterable from collections, which Python 3.10+ no longer allows
import random
from itertools import permutations
from scipy.io import loadmat
from scipy.sparse import issparse

from concurrent.futures import ProcessPoolExecutor

from multiprocessing import cpu_count

logger = logging.getLogger("deepwalk")

__author__ = "Bryan Perozzi"
__email__ = "[email protected]"

LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"


class Graph(defaultdict):
    """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops"""

    def __init__(self):
        """
        这句话的意思是,构建的图是一个字典,key是节点,key对应的value是list,表示该节点所在边的另一端节点序列
        Graph的__init__直接订制于defaultdict的__init__(default_factory=None, **kwargs)
        其中default_factory在这里被制定为list,即字典的value类型全部为list
        事实上如果你构建了一个Graph实例然后print出来后,会得到类似的输出形式:

        defaultdict(
                        <class 'list'>,

                        {
                            1: [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32],
                            2: [1, 3, 4, 8, 14, 18, 20, 22, 31],
                            ……
                            34: [9, 10, 14, 15, 16, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33]
                        }
                    )
        """
        super(Graph, self).__init__(list)

    def nodes(self):
        """返回图中的所有节点"""
        return self.keys()

    def adjacency_iter(self):
        """
        iteritems() 和 viewitems() 是python2.x中dict的函数
        Python 3.x 里面,iteritems() 和 viewitems() 这两个方法都已经废除了
        在Python2.x中,items( )用于返回一个字典的拷贝列表,占额外的内存。
        iteritems() 用于返回本身字典列表操作后的迭代,不占用额外的内存。
        Python 3.x 里面,items() 得到的结果是和 2.x 里面 viewitems() 一致的。
        在3.x 里 用 items()替换iteritems() ,可以用于 for 来循环遍历。
        """

        """
        dict_items([(1, [2, 4, 5, 6]), (2, [1, 3]),...,)
        """
        return self.items()  # 源代码这里写的是iteritems()

    def subgraph(self, nodes={}):
        subgraph = Graph()

        for n in nodes:
            if n in self:
                subgraph[n] = [x for x in self[n] if x in nodes]

        return subgraph

    def make_undirected(self):
        """
        将有向图变为无向图
        :return:
        """

        t0 = time()

        for v in self.keys():
            for other in self[v]:
                if v != other:
                    # if v != other and v not in self[other]:
                    # # this line could replace the one above and make the de-duplication
                    # # below unnecessary, but sorting would still be required
                    self[other].append(v)

        t1 = time()
        logger.info('make_directed: added missing edges {}s'.format(t1 - t0))

        self.make_consistent()
        """这个函数的作用是:
        1. 用来去除重复的,因为之前的工作会得到酱紫的输出:
            {1: [2, 5, 6, 2, 4, 5, 6], 2: [1, 3], ...}
        2. 同时,这个函数还可以保证每一项的list是从小到达排序的。
        3. 最后这个函数内部调用了remove_self_loops(),可以去除掉自循环

        """

        return self

    def make_consistent(self):
        """这个函数的作用是:
               1. 用来去除重复的,因为之前的工作会得到酱紫的输出:
                   {1: [2, 5, 6, 2, 4, 5, 6], 2: [1, 3], ...}
               2. 同时,这个函数还可以保证每一项的list是从小到达排序的。
               3. 最后这个函数内部调用了remove_self_loops(),可以去除掉自循环

               """
        t0 = time()
        for k in iterkeys(self):
            self[k] = list(sorted(set(self[k])))
            # each list self[k] is first de-duplicated with set(),
            # then ordered with sorted(),
            # and finally converted back to a list and reassigned
        t1 = time()
        logger.info('make_consistent: made consistent in {}s'.format(t1 - t0))

        # print("not remove_self_loops", self)
        self.remove_self_loops()
        # print("remove_self_loops:",self)

        return self

    def remove_self_loops(self):
        """
        去除掉节点到自己的自循环
        :return:
        """
        removed = 0
        t0 = time()

        for x in self:
            if x in self[x]:
                self[x].remove(x)
                removed += 1

        t1 = time()

        logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1 - t0)))
        return self

    def check_self_loops(self):
        """
        检查是否含有自循环(节点v到节点v自己到路径)
        :return:
        """
        for x in self:
            for y in self[x]:
                if x == y:
                    return True

        return False

    def has_edge(self, v1, v2):
        """
        判断两个节点之间是否有边
        :param v1:
        :param v2:
        :return:
        """
        if v2 in self[v1] or v1 in self[v2]:
            return True
        return False

    def degree(self, nodes=None):
        """
        这个函数用来求节点到度数对于有向图而言,求到是入度或初度(具体是哪个要看你到dic是根据什么构建的)
        输入的nodes参数可以是一个节点编号,也可以是一系列节点的可迭代对象如list。如果是单个节点,则直接返回
        节点的度,如果是一些列节点,则返回dic,key对应为目标节点,value为该节点对应的度数。
        :param nodes:
        :return:
        """
        if isinstance(nodes, Iterable):
            return {v: len(self[v]) for v in nodes}
        else:
            return len(self[nodes])
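
    # Hedged illustration (not in the original file): with the karate-club-style adjacency
    # shown in __init__'s docstring, G.degree(2) would return len(G[2]) == 9, while
    # G.degree([1, 2]) would return a dict such as {1: 16, 2: 9}.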

    def order(self):
        "Returns the number of nodes in the graph"
        return len(self)

    def number_of_edges(self):
        "Returns the number of nodes in the graph"
        """
        请注意,这里的边的个数对应的是无向图,如果是有向图,需要去掉除以2的操作
        """

        return sum([self.degree(x) for x in self.keys()]) / 2

    def number_of_nodes(self):
        "Returns the number of nodes in the graph"
        return self.order()

    def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
        """ Returns a truncated random walk.
            返回一些较短的随机游走路径
            path_length: Length of the random walk.
            alpha: probability of restarts.
            start: the start node of the random walk.
            random.Random():产生0-1之间的随机浮点数

            请注意:这里的随机游走路径未必是连续的,有可能是走着走着突然回到起点接着走
        """
        G = self
        if start:
            path = [start]
        else:
            # Sampling is uniform w.r.t V, and not w.r.t E
            path = [rand.choice(list(G.keys()))]

        while len(path) < path_length:
            cur = path[-1]
            if len(G[cur]) > 0:
                if rand.random() >= alpha:  # a classic way to take a branch with a given probability
                    """
                    this branch is taken with probability 1 - alpha
                    """
                    path.append(rand.choice(G[cur]))
                else:
                    path.append(path[0])  # restart: jump back to the walk's start node
                    """
                    with probability 1 - alpha the walk steps to a random neighbour of the
                    current node; with probability alpha it restarts from the start of the
                    path (alpha is the restart probability, so appending path[0] is intentional)
                    """
            else:
                break
        return [str(node) for node in path]
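
    # Hedged usage sketch (not part of the original file): with the demo path graph built in the
    # __main__ block below, after g.make_undirected() a call like
    #     g.random_walk(5, alpha=0, rand=random.Random(0), start=1)
    # returns a list of 5 node ids as strings (for that graph, something like ['1', '2', '3', '2', '1']);
    # with alpha > 0, some steps jump back to '1', the start node, before continuing.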


def build_deepwalk_corpus(G, num_paths, path_length, alpha=0,
                          rand=random.Random(0)):
    """
    这个函数可以对一个图生成一个语料库
    :param G:
    :param num_paths:
    :param path_length:
    :param alpha:
    :param rand:
    :return:
    """
    walks = []

    nodes = list(G.nodes())

    for cnt in range(num_paths):
        # repeat the random-walk pass below num_paths times,
        # i.e. every node becomes the start of num_paths random walks
        rand.shuffle(nodes)
        for node in nodes:  # one random walk starting from each node in the graph
            walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node))

    return walks
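
# Hedged illustration (not in the original file): for a graph with N nodes,
# build_deepwalk_corpus(G, num_paths=10, path_length=40) returns a list of 10 * N walks,
# each a list of at most 40 node ids as strings; this is the corpus DeepWalk later feeds to word2vec.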


def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0,
                               rand=random.Random(0)):
    """
    这个函数可以迭代地生成语料库
    :param G:
    :param num_paths:
    :param path_length:
    :param alpha:
    :param rand:
    :return:
    """

    nodes = list(G.nodes())

    for cnt in range(num_paths):
        rand.shuffle(nodes)
        for node in nodes:
            yield G.random_walk(path_length, rand=rand, alpha=alpha, start=node)


def clique(size):
    """

    :param size:
    :return:
    """

    return from_adjlist(permutations(range(1, size + 1)))


# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def grouper(n, iterable, padvalue=None):
    "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def parse_adjacencylist(f):
    adjlist = []
    for l in f:
        if l and l[0] != "#":
            introw = [int(x) for x in l.strip().split()]
            row = [introw[0]]
            row.extend(set(sorted(introw[1:])))
            adjlist.extend([row])

    return adjlist


def parse_adjacencylist_unchecked(f):
    adjlist = []
    for l in f:
        if l and l[0] != "#":
            adjlist.extend([[int(x) for x in l.strip().split()]])

    return adjlist
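
# Hedged note (not in the original file): both parsers expect one adjacency row per line,
# "node neighbour1 neighbour2 ...", with lines starting with '#' treated as comments, e.g.:
#   1 2 3 4
#   2 1 3
#   3 1 2
# parse_adjacencylist() additionally de-duplicates each row's neighbours, while the
# _unchecked variant keeps each row exactly as written.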


def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):
    if unchecked:
        parse_func = parse_adjacencylist_unchecked
        convert_func = from_adjlist_unchecked
    else:
        parse_func = parse_adjacencylist
        convert_func = from_adjlist

    adjlist = []

    t0 = time()

    with open(file_) as f:
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            total = 0
            for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))):
                adjlist.extend(adj_chunk)
                total += len(adj_chunk)

    t1 = time()

    logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1 - t0))

    t0 = time()
    G = convert_func(adjlist)
    t1 = time()

    logger.info('Converted edges to graph in {}s'.format(t1 - t0))

    if undirected:
        t0 = time()
        G = G.make_undirected()
        t1 = time()
        logger.info('Made graph undirected in {}s'.format(t1 - t0))

    return G


def load_edgelist(file_, undirected=True):
    G = Graph()
    with open(file_) as f:
        for l in f:
            x, y = l.strip().split()[:2]
            x = int(x)
            y = int(y)
            G[x].append(y)
            if undirected:
                G[y].append(x)

    G.make_consistent()
    return G
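
# Hedged note (not in the original file): load_edgelist expects one edge per line as two
# whitespace-separated integer node ids (extra columns, e.g. weights, are ignored):
#   1 2
#   1 3
#   2 3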


def load_matfile(file_, variable_name="network", undirected=True):
    mat_varables = loadmat(file_)
    mat_matrix = mat_varables[variable_name]

    return from_numpy(mat_matrix, undirected)


def from_networkx(G_input, undirected=True):
    G = Graph()

    for idx, x in enumerate(G_input.nodes_iter()):  # in networkx >= 2.0 this is G_input.nodes()
        for y in iterkeys(G_input[x]):
            G[x].append(y)

    if undirected:
        G.make_undirected()

    return G


def from_numpy(x, undirected=True):
    G = Graph()

    if issparse(x):
        cx = x.tocoo()
        for i, j, v in zip(cx.row, cx.col, cx.data):
            G[i].append(j)
    else:
        raise Exception("Dense matrices not yet supported.")

    if undirected:
        G.make_undirected()

    G.make_consistent()
    return G


def from_adjlist(adjlist):
    G = Graph()

    for row in adjlist:
        node = row[0]
        neighbors = row[1:]
        G[node] = list(sorted(set(neighbors)))

    return G


def from_adjlist_unchecked(adjlist):
    G = Graph()

    for row in adjlist:
        node = row[0]
        neighbors = row[1:]
        G[node] = neighbors

    return G


if __name__ == "__main__":
    g = Graph()
    g[1] = [2]
    g[2] = [3]
    g[3] = [4]
    g[4] = [5]
    g[5] = [6]
    g[6] = [7]
    g[7] = [8]
    g[8] = [9]
    g[9] = [10]
    g[10] = []

    # print(g.nodes())
    # print(g.adjacency_iter())
    # for key, edge_nodes in g.adjacency_iter():
    #     for edge_node in edge_nodes:
    #         print(key, "---", edge_node)
    # print(g.subgraph({2,5, 6, 7}))
    print(g)
    g.make_undirected()
    print(g)
    print(g.random_walk(10, 1, start=1))  # the second positional argument is alpha=1, so every step restarts at node 1
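
To see how the pieces fit together, here is a minimal end-to-end sketch. It assumes the listing above is saved as graph.py on your path, and example.edgelist is a hypothetical placeholder for an edge-list file with one "u v" pair per line (not a file shipped with the repository):

import random
from graph import load_edgelist, build_deepwalk_corpus

# load a hypothetical edge list and make it undirected
G = load_edgelist("example.edgelist", undirected=True)
print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# 10 walks per node, each truncated at 40 nodes, no restarts (alpha=0)
walks = build_deepwalk_corpus(G, num_paths=10, path_length=40,
                              alpha=0, rand=random.Random(0))
print(len(walks), "walks; first walk:", walks[0])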
