Enabling training iteration loop offloading in session.run mode
Automatic migration scenario
In session.run mode, set the iterations_per_loop parameter with set_iteration_per_loop, and change the number of session.run calls to the original number of calls divided by iterations_per_loop.
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)

import tensorflow as tf

# Model settings
# Learning rate
learning_rate = 0.01
# Number of training epochs
training_epochs = 10
# Batch size
batch_size = 100
# Display the loss every N epochs
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Model parameters
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Build the model
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Loss function: cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Gradient update
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["mix_compile_mode"].b = False     # Disable mixed computing; configure as needed (disabled by default)
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the network contains a getnext operator, enable getnext offloading; getnext offloading is a prerequisite for iteration loop offloading
custom_op.parameter_map["iterations_per_loop"].i = 10     # Must match the iterations_per_loop value passed to set_iteration_per_loop; used to decide whether to offload the training iterations
config = npu_config_proto(config_proto=config)

# Train the model
with tf.Session(config=config) as sess:
    sess.run(init)
    # In sess.run mode, set the device-side loop count to 10.
    # sess is the created TensorFlow session, optimizer is the defined gradient-update op,
    # and 10 is the number of training iterations to run on the device.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
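The complete example above keeps the host-side loop unchanged. The sketch below illustrates the adjustment described at the start of this section, namely reducing the number of session.run calls to the original count divided by iterations_per_loop. It continues the example above and is illustrative only; the loop-bound arithmetic is an assumption about your script, not part of the original sample.

# Illustrative continuation of the example above: each sess.run of train_op
# already performs iterations_per_loop (10) training iterations on the device,
# so the host only needs 1/10 of the original number of sess.run calls.
iterations_per_loop = 10
total_batch = int(mnist.train.num_examples / batch_size)
for i in range(total_batch // iterations_per_loop):
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})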
The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, it has been frozen, or the session is created with tf.train.Supervisor), set_iteration_per_loop cannot be used to set the device-side loop count. In that case, you can set the loop count with the create_iteration_per_loop_var and load_iteration_per_loop_var APIs instead, as shown in the example code below.
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)

import tensorflow as tf

# Model settings
# Learning rate
learning_rate = 0.01
# Number of training epochs
training_epochs = 10
# Batch size
batch_size = 100
# Display the loss every N epochs
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Model parameters
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Build the model
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Loss function: cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Gradient update
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["mix_compile_mode"].b = False     # Disable mixed computing; configure as needed (disabled by default)
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the network contains a getnext operator, enable getnext offloading; getnext offloading is a prerequisite for iteration loop offloading
custom_op.parameter_map["iterations_per_loop"].i = 10     # Must match the iterations_per_loop value passed to load_iteration_per_loop_var; used for the function check
config = npu_config_proto(config_proto=config)

# Train the model
with tf.Session(config=config) as sess:
    sess.run(init)
    # In sess.run mode, set the device-side loop count to 10
    iteration = util.IterationPerLoop()                             # Create an IterationPerLoop object
    train_op = iteration.create_iteration_per_loop_var(optimizer)   # optimizer is the defined gradient-update op
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)          # Freeze the graph
    iteration.load_iteration_per_loop_var(sess, 10)                 # Set the device-side loop count; sess is the created TensorFlow session and 10 is the number of training iterations to run on the device
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
Manual migration scenario
In session.run mode, set the iterations_per_loop parameter with set_iteration_per_loop, and change the number of session.run calls to the original number of calls divided by iterations_per_loop, as shown in the example code below.
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)

import tensorflow as tf

# Model settings
# Learning rate
learning_rate = 0.01
# Number of training epochs
training_epochs = 10
# Batch size
batch_size = 100
# Display the loss every N epochs
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Model parameters
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Build the model
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Loss function: cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Gradient update
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True          # Run training on the Ascend AI Processor
custom_op.parameter_map["mix_compile_mode"].b = False     # Disable mixed computing; configure as needed (disabled by default)
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the network contains a getnext operator, enable getnext offloading; getnext offloading is a prerequisite for iteration loop offloading
custom_op.parameter_map["iterations_per_loop"].i = 10     # Must match the iterations_per_loop value passed to set_iteration_per_loop; used to decide whether to offload the training iterations
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

# Train the model
with tf.Session(config=config) as sess:
    sess.run(init)
    # In sess.run mode, set the device-side loop count to 10.
    # sess is the created TensorFlow session, optimizer is the defined gradient-update op,
    # and 10 is the number of training iterations to run on the device.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, it has been frozen, or the session is created with tf.train.Supervisor), set_iteration_per_loop cannot be used to set the device-side loop count. In that case, you can set the loop count with the create_iteration_per_loop_var and load_iteration_per_loop_var APIs instead, as shown in the example code below.
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)

import tensorflow as tf

# Model settings
# Learning rate
learning_rate = 0.01
# Number of training epochs
training_epochs = 10
# Batch size
batch_size = 100
# Display the loss every N epochs
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Model parameters
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Build the model
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Loss function: cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Gradient update
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True          # Run training on the Ascend AI Processor
custom_op.parameter_map["mix_compile_mode"].b = False     # Disable mixed computing; configure as needed (disabled by default)
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the network contains a getnext operator, enable getnext offloading; getnext offloading is a prerequisite for iteration loop offloading
custom_op.parameter_map["iterations_per_loop"].i = 10     # Must match the iterations_per_loop value passed to load_iteration_per_loop_var; used to decide whether to offload the training iterations
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

# Train the model
with tf.Session(config=config) as sess:
    sess.run(init)
    # In sess.run mode, set the device-side loop count to 10
    iteration = util.IterationPerLoop()                             # Create an IterationPerLoop object
    train_op = iteration.create_iteration_per_loop_var(optimizer)   # optimizer is the defined gradient-update op
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)          # Freeze the graph
    iteration.load_iteration_per_loop_var(sess, 10)                 # Set the device-side loop count; sess is the created TensorFlow session and 10 is the number of training iterations to run on the device
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
After changing the loop count, you are advised to adjust other loop-related logic in the script as needed, for example how the per-step time is measured and how the iteration counter is updated. A sketch of such an adjustment follows.
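For instance, if the script times each training step or maintains a global step counter on the host, each sess.run now covers iterations_per_loop device-side iterations, so those statistics need to be rescaled. The sketch below continues the examples above and is illustrative only; step_time and global_step are hypothetical names, not from the original script.

import time

iterations_per_loop = 10
global_step = 0

start = time.time()
_, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
elapsed = time.time() - start

# One sess.run now executes iterations_per_loop training steps on the device,
# so rescale the per-step time and advance the step counter accordingly.
step_time = elapsed / iterations_per_loop
global_step += iterations_per_loop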
Checking that iterations_per_loop takes effect
After training iteration loop offloading is enabled, you can check whether iterations_per_loop has taken effect by searching the host-side INFO log for the keyword "Insert op success".
Set the host-side log level to INFO with the following command. INFO logs are written to "$HOME/ascend/log/run/plog/" by default.
export ASCEND_GLOBAL_LOG_LEVEL=1
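After a training run you can then search the plog directory for the keyword. The snippet below is a minimal, illustrative way to do that scan in Python; the file layout under the plog directory is an assumption, and any text search over those logs works equally well.

import glob
import os

# Scan the default host-side plog directory for the keyword that indicates
# iterations_per_loop has taken effect.
plog_dir = os.path.expanduser("~/ascend/log/run/plog")
matched = []
for path in glob.glob(os.path.join(plog_dir, "**", "*"), recursive=True):
    if not os.path.isfile(path):
        continue
    with open(path, errors="ignore") as f:
        if "Insert op success" in f.read():
            matched.append(path)

if matched:
    print("Found 'Insert op success' in:", matched)
else:
    print("'Insert op success' not found; iterations_per_loop may not have taken effect.")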