Tutorial: Train a Node Classification Model on Your Local Machine¶
This tutorial presents an end-to-end example of how GraphScope trains an EgoGraphSAGE model for a node classification task. We use the ogbn-mag dataset, a heterogeneous academic citation network that is a subset of the larger Microsoft Academic Graph. The dataset contains four types of entities (papers, authors, institutions, and fields of study) as well as four types of directed relations connecting them.
Given the heterogeneous ogbn-mag data, the GNN task is to predict the class of each paper. To achieve this, we classify papers using both their attributes and the graph structure. Specifically, each paper node in the graph carries a 128-dimensional word2vec vector representing its content, obtained by averaging the embeddings of the words in its title and abstract; the individual word embeddings are pre-trained.
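The ogbn-mag files already ship these vectors precomputed, so the following is purely an illustrative sketch of the averaging step. It uses a hypothetical pre-trained embedding table; pretrained_embeddings and paper_feature are made-up names, not part of the dataset or GraphScope:
import numpy as np
# Hypothetical pre-trained word embeddings; in ogbn-mag these come from word2vec.
pretrained_embeddings = {
    "graph": np.random.rand(128),
    "neural": np.random.rand(128),
    "network": np.random.rand(128),
}
def paper_feature(words):
    # Average the embeddings of the words (title + abstract) that have a vector.
    vectors = [pretrained_embeddings[w] for w in words if w in pretrained_embeddings]
    return np.mean(vectors, axis=0)
print(paper_feature(["graph", "neural", "network"]).shape)  # (128,)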
Load the Graph¶
try:
    # https://www.tensorflow.org/guide/migrate
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
except ImportError:
    import tensorflow as tf
import graphscope as gs
from graphscope.dataset import load_ogbn_mag
from graphscope.learning.examples import EgoGraphSAGE
from graphscope.learning.examples import EgoSAGESupervisedDataLoader
from graphscope.learning.examples.tf.trainer import LocalTrainer
# Enable logging
gs.set_option(show_log=True)
# Load the ogbn-mag graph as an example.
graph = load_ogbn_mag()
# print the schema of the graph
print(graph)
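Besides printing the whole graph object, you can also inspect the schema directly; a minimal sketch, assuming the schema property exposed on the loaded GraphScope graph:
# A minimal sketch (assumes the `schema` property of the loaded graph):
# it lists the vertex labels (paper, author, institution, field_of_study),
# the edge labels, and the properties attached to each label.
print(graph.schema)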
Define the Training Process of the EgoGraphSAGE Model¶
def train(graph, node_type, edge_type, class_num, features_num,
          hops_num=2, nbrs_num=[25, 10], epochs=2,
          hidden_dim=256, in_drop_rate=0.5, learning_rate=0.01):
    gs.learning.reset_default_tf_graph()
    dimensions = [features_num] + [hidden_dim] * (hops_num - 1) + [class_num]
    model = EgoGraphSAGE(dimensions, act_func=tf.nn.relu, dropout=in_drop_rate)
    # prepare the training dataset
    train_data = EgoSAGESupervisedDataLoader(
        graph, gs.learning.Mask.TRAIN,
        node_type=node_type, edge_type=edge_type,
        nbrs_num=nbrs_num, hops_num=hops_num,
    )
    train_embedding = model.forward(train_data.src_ego)
    train_labels = train_data.src_ego.src.labels
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=train_labels, logits=train_embedding,
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # prepare the test dataset
    test_data = EgoSAGESupervisedDataLoader(
        graph, gs.learning.Mask.TEST,
        node_type=node_type, edge_type=edge_type,
        nbrs_num=nbrs_num, hops_num=hops_num,
    )
    test_embedding = model.forward(test_data.src_ego)
    test_labels = test_data.src_ego.src.labels
    test_indices = tf.math.argmax(test_embedding, 1, output_type=tf.int32)
    test_acc = tf.div(
        tf.reduce_sum(tf.cast(tf.math.equal(test_indices, test_labels), tf.float32)),
        tf.cast(tf.shape(test_labels)[0], tf.float32),
    )
    # train and test
    trainer = LocalTrainer()
    trainer.train(train_data.iterator, loss, optimizer, epochs=epochs)
    trainer.test(test_data.iterator, test_acc)
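To make the defaults above concrete: hops_num=2 stacks two GraphSAGE layers, and nbrs_num=[25, 10] samples 25 one-hop and 10 two-hop neighbors for every seed node. With the feature and class sizes used later in this tutorial, the layer dimensions work out as follows:
# Worked example of the `dimensions` list for the values used in this tutorial.
features_num, hidden_dim, class_num, hops_num = 128, 256, 349, 2
dimensions = [features_num] + [hidden_dim] * (hops_num - 1) + [class_num]
print(dimensions)  # [128, 256, 349]: layer 1 maps 128 -> 256, layer 2 maps 256 -> 349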
Launch the Learning Engine¶
# Define the features for learning; here we use the original 128-dimensional paper features.
i_features = ["feat_" + str(i) for i in range(128)]
# Launch a learning engine; here we split the paper nodes into 75% train, 10% validation and 15% test.
lg = gs.graphlearn(
    graph,
    nodes=[("paper", i_features)],
    edges=[("paper", "cites", "paper")],
    gen_labels=[
        ("train", "paper", 100, (0, 75)),
        ("val", "paper", 100, (75, 85)),
        ("test", "paper", 100, (85, 100)),
    ],
)
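Note that gen_labels also creates a validation split (parts 75-85, i.e. 10% of the paper nodes), which the train() function defined above never touches. If you want to monitor validation accuracy as well, a sketch of an extra data loader, assuming gs.learning.Mask.VAL is available alongside Mask.TRAIN and Mask.TEST:
# Sketch only: a validation data loader mirroring the test loader in train().
# Assumes gs.learning.Mask.VAL exists alongside Mask.TRAIN and Mask.TEST.
val_data = EgoSAGESupervisedDataLoader(
    lg, gs.learning.Mask.VAL,
    node_type="paper", edge_type="cites",
    nbrs_num=[25, 10], hops_num=2,
)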
Train the Model¶
train(lg, node_type="paper", edge_type="cites",
      class_num=349,     # output dimension
      features_num=128,  # input dimension
)