Tutorial: Training a Node Classification Model on a Local Machine

This tutorial walks through an end-to-end example of how GraphScope trains an EgoGraphSAGE model for a node classification task. We use the ogbn-mag dataset, a heterogeneous academic citation network that is a subset of the larger Microsoft Academic Graph. The dataset contains four types of entities (papers, authors, institutions, and fields of study) and four types of directed relations: a paper cites a paper, an author writes a paper, an author is affiliated with an institution, and a paper has a topic in a field of study.

On the heterogeneous ogbn-mag data, the goal of the GNN task is to predict the class of a paper. To achieve this, we classify papers using both attribute and structural information. Specifically, each paper node in the graph carries a 128-dimensional word2vec vector representing its content, obtained by averaging the embeddings of the words in the paper's title and abstract. The individual word embeddings are pre-trained.
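To make that feature construction concrete, here is a minimal sketch of the averaging step, assuming a hypothetical pre-trained lookup table word2vec that maps each token to a 128-dimensional vector (the table and tokens below are illustrative only, not part of the dataset pipeline):

import numpy as np

# Hypothetical pre-trained lookup: token -> 128-dimensional embedding.
rng = np.random.default_rng(0)
word2vec = {w: rng.random(128) for w in ["graph", "neural", "network"]}

def paper_feature(tokens):
    # Average the pre-trained embeddings of the words that appear in
    # the paper's title and abstract to get one 128-d node feature.
    vectors = [word2vec[t] for t in tokens if t in word2vec]
    return np.mean(vectors, axis=0)

print(paper_feature(["graph", "neural", "network"]).shape)  # (128,)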

Load the graph

try:
    # https://www.tensorflow.org/guide/migrate
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
except ImportError:
    import tensorflow as tf

import graphscope as gs
from graphscope.dataset import load_ogbn_mag
from graphscope.learning.examples import EgoGraphSAGE
from graphscope.learning.examples import EgoSAGESupervisedDataLoader
from graphscope.learning.examples.tf.trainer import LocalTrainer

# Enable logging
gs.set_option(show_log=True)

# load the ogbn-mag graph as an example
graph = load_ogbn_mag()

# print the schema of the graph
print(graph)

Define the training process for the EgoGraphSAGE model

def train(graph, node_type, edge_type, class_num, features_num,
          hops_num=2, nbrs_num=[25, 10], epochs=2,
          hidden_dim=256, in_drop_rate=0.5, learning_rate=0.01):
    gs.learning.reset_default_tf_graph()

    # layer dimensions: input features -> hidden layers -> class logits,
    # e.g. [128, 256, 349] with the defaults hops_num=2, hidden_dim=256
    dimensions = [features_num] + [hidden_dim] * (hops_num - 1) + [class_num]
    model = EgoGraphSAGE(dimensions, act_func=tf.nn.relu, dropout=in_drop_rate)

    # prepare the training dataset: sample ego graphs of depth hops_num
    # around seed nodes, taking nbrs_num[i] neighbors at hop i + 1
    train_data = EgoSAGESupervisedDataLoader(
        graph, gs.learning.Mask.TRAIN,
        node_type=node_type, edge_type=edge_type,
        nbrs_num=nbrs_num, hops_num=hops_num,
    )
    train_embedding = model.forward(train_data.src_ego)
    train_labels = train_data.src_ego.src.labels
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=train_labels, logits=train_embedding,
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # prepare the test dataset
    test_data = EgoSAGESupervisedDataLoader(
        graph, gs.learning.Mask.TEST,
        node_type=node_type, edge_type=edge_type,
        nbrs_num=nbrs_num, hops_num=hops_num,
    )
    test_embedding = model.forward(test_data.src_ego)
    test_labels = test_data.src_ego.src.labels
    test_indices = tf.math.argmax(test_embedding, 1, output_type=tf.int32)
    test_acc = tf.math.divide(
        tf.reduce_sum(tf.cast(tf.math.equal(test_indices, test_labels), tf.float32)),
        tf.cast(tf.shape(test_labels)[0], tf.float32),
    )

    # train and test
    trainer = LocalTrainer()
    trainer.train(train_data.iterator, loss, optimizer, epochs=epochs)
    trainer.test(test_data.iterator, test_acc)

Launch the learning engine

# define the features for learning; we use the original 128-dimensional features
i_features = ["feat_" + str(i) for i in range(128)]

# launch a learning engine. gen_labels splits the "paper" nodes into 100
# buckets: 75% as train, 10% as validation, and 15% as test.
lg = gs.graphlearn(
    graph,
    nodes=[("paper", i_features)],
    edges=[("paper", "cites", "paper")],
    gen_labels=[
        ("train", "paper", 100, (0, 75)),
        ("val", "paper", 100, (75, 85)),
        ("test", "paper", 100, (85, 100)),
    ],
)
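
Note that gen_labels above also reserves 10% of the paper nodes as a validation split, which the train function defined earlier never uses. To monitor validation accuracy as well, you could add a block like the following inside train(), mirroring the test section; this is a sketch that assumes the data loader also accepts gs.learning.Mask.VAL, analogous to TRAIN and TEST:

    # sketch only: validation accuracy, mirroring the test block in train()
    val_data = EgoSAGESupervisedDataLoader(
        graph, gs.learning.Mask.VAL,  # assumed VAL mask, analogous to TRAIN/TEST
        node_type=node_type, edge_type=edge_type,
        nbrs_num=nbrs_num, hops_num=hops_num,
    )
    val_embedding = model.forward(val_data.src_ego)
    val_labels = val_data.src_ego.src.labels
    val_indices = tf.math.argmax(val_embedding, 1, output_type=tf.int32)
    val_acc = tf.math.divide(
        tf.reduce_sum(tf.cast(tf.math.equal(val_indices, val_labels), tf.float32)),
        tf.cast(tf.shape(val_labels)[0], tf.float32),
    )
    trainer.test(val_data.iterator, val_acc)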

Train the model

train(lg, node_type="paper", edge_type="cites",
      class_num=349,    # output dimension: number of paper classes
      features_num=128) # input dimension: size of the word2vec features