Skip to content

Commit

Permalink
Significant Speed Improvements
Browse files Browse the repository at this point in the history
- Significantly increased JIT compilation coverage (currently, the only things preventing complete compilation are the TensorFlow calls)
- Changed how boards are passed from CPU to GPU: they now use between 80 and 200 bits per board (depending on occupancy), as opposed to the previous 480
- Now using a fully JIT compiled PriorityBin implementation, by utilizing the linked list system used in the negamax search.
- Added the classes_and_structs.py file to hold JitClasses and NumPy structured scalar types (and basic functions involving them)
- Improved the test for ANN inference
- Various minor refactoring and code cleaning
  • Loading branch information
SamRagusa committed Nov 22, 2018
1 parent a2eed16 commit 90eee66
Show file tree
Hide file tree
Showing 13 changed files with 822 additions and 792 deletions.
13 changes: 5 additions & 8 deletions batch_first/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def get_table_and_array_for_set_of_dicts(dicts):
return index_lookup_table, array



def generate_move_filter_table():
"""
Generate a lookup table for the policy encoding described in the following paper:
Expand Down Expand Up @@ -93,7 +92,6 @@ def generate_move_filter_table():
return filter_table



MOVE_FILTER_LOOKUP = generate_move_filter_table()


Expand Down Expand Up @@ -171,7 +169,6 @@ def generate_move_filter_table():
LOSS_RESULT_SCORES[j] = np.nextafter(LOSS_RESULT_SCORES[j - 1], MAX_FLOAT32_VAL)



SIZE_EXPONENT_OF_TWO_FOR_TT_INDICES = np.uint8(30)
TT_HASH_MASK = np.uint64(2 ** (SIZE_EXPONENT_OF_TWO_FOR_TT_INDICES) - 1)

Expand Down Expand Up @@ -297,11 +294,11 @@ def power_set(iterable):
flip_vert_const_1 = np.uint64(0x00FF00FF00FF00FF)
flip_vert_const_2 = np.uint64(0x0000FFFF0000FFFF)

@nb.vectorize([nb.uint64(nb.uint64)])
def vectorized_flip_vertically(bb):
bb = ((bb >> 8) & flip_vert_const_1) | ((bb & flip_vert_const_1) << 8)
@nb.vectorize([nb.uint64(nb.uint64)], nopython=True)
def flip_vertically(bb):
bb = ((bb >> 8) & flip_vert_const_1) | ((bb & flip_vert_const_1) << 8)
bb = ((bb >> 16) & flip_vert_const_2) | ((bb & flip_vert_const_2) << 16)
bb = (bb >> 32) | (bb << 32)
bb = ( bb >> 32) | ( bb << 32)
return bb

def get_castling_lookup_tables():
Expand All @@ -310,7 +307,7 @@ def get_castling_lookup_tables():
possible_castling_rights[j] = np.uint64(functools.reduce(lambda x, y: x | y, set, np.uint64(0)))

white_turn_castling_tables = create_index_table(possible_castling_rights)
black_turn_castling_tables = create_index_table(vectorized_flip_vertically(possible_castling_rights))
black_turn_castling_tables = create_index_table(flip_vertically(possible_castling_rights))

return white_turn_castling_tables, black_turn_castling_tables, possible_castling_rights

Expand Down
98 changes: 86 additions & 12 deletions batch_first/anns/ann_creation_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,86 @@
from tensorflow.contrib import layers
from tensorflow.python import training

import chess

from functools import reduce

from batch_first.chestimator import get_board_data
from google.protobuf import text_format

from batch_first.numba_board import popcount


from tensorflow.contrib import tensorrt as trt




def parse_into_ann_input_inference(max_boards, convert_to_nhwc=False):
    """
    Build the TensorFlow ops which expand compactly encoded boards into the dense
    one-hot tensor consumed by the ANN.

    Two placeholders are created:
      * "piece_filters": int32 vector; each value is used as an index into a lookup
        table that splits it into its high (n >> 4) and low (n & 0x0F) 4-bit halves,
        each half being a filter (channel) index.
      * "occupied_bbs": int64 vector holding one occupancy bitboard per board.

    :param max_boards: Size of the precomputed lookup constants (they are built for
        exactly this many boards).  Must be a positive integer.
        NOTE(review): presumably the number of boards fed per run must not exceed
        this value - confirm against the callers.
    :param convert_to_nhwc: If True, the output is transposed from
        [boards, 15, 8, 8] to [boards, 8, 8, 15].
    :return: A tuple ((piece_indicators, occupied_bbs), boards) containing the two
        placeholders and the float32 tensor built from them.
    :raises ValueError: If max_boards is None or less than 1.

    NOTES:
    1) If a constant/operation is typed in a confusing manner, it's so the entirety of this can be done on GPU
    """
    if max_boards is None or max_boards < 1:
        # The lookup constants below are sized by max_boards, so fail early with a clear message
        # instead of an opaque NumPy shape error.
        raise ValueError("max_boards must be a positive integer, got %r" % (max_boards,))

    # Upper bound on the number of (filter, rank, file) indices a single board can produce.
    # NOTE(review): presumably 32 pieces plus one extra marker (e.g. en passant) - confirm.
    max_indices_per_board = 33

    # Every possible 16 bit chunk of an occupancy bitboard, and the number of set bits in each
    possible_lookup_nums = np.arange(2 ** 16, dtype=np.uint16)
    num_bits = popcount(possible_lookup_nums.astype(np.uint64))

    # [rank, file] coordinates of the 64 squares, tiled for each board and regrouped into 4 chunks of 16
    # squares so they line up with the 4 uint16 chunks each bitboard is bitcast into
    location_lookup_ary = np.array([[[chess.square_rank(loc), chess.square_file(loc)] for loc in chess.SQUARES_180]], np.int32)
    location_lookup_ary = np.ones([max_boards, 1, 1], np.int32) * location_lookup_ary

    location_lookup_ary = location_lookup_ary.reshape([max_boards, 8, 8, 2])[:, ::-1]
    location_lookup_ary = location_lookup_ary.reshape([max_boards, 4, 16, 2])

    # For each possible 16 bit chunk, a length 16 boolean mask of its set bits
    mask_getter = lambda n: np.unpackbits(np.frombuffer(n, dtype=np.uint8)[::-1])[::-1]
    masks_to_gather_ary = np.array(list(map(mask_getter, possible_lookup_nums)), dtype=np.bool_)

    # Splits a packed value into its two 4 bit filter indices (high half first)
    pieces_from_nums = lambda n: [n >> 4, (n & np.uint8(0x0F))]
    piece_lookup_ary = np.array(list(map(pieces_from_nums, possible_lookup_nums)), dtype=np.int32)

    range_repeater = numpy_style_repeat_1d_creator(
        max_multiple=max_indices_per_board,
        max_to_repeat=max_boards,
        out_type=tf.int64)

    popcount_lookup = tf.constant(num_bits, tf.int64)
    locations_for_masking = tf.constant(location_lookup_ary, tf.int64)
    occupancy_mask_table = tf.constant(masks_to_gather_ary, tf.half)
    piece_lookup_table = tf.constant(piece_lookup_ary, tf.int64)

    # This is used since there seems to be no simple/efficient way to broadcast for scatter_nd
    ones_to_slice = tf.constant(np.ones(max_indices_per_board * max_boards), dtype=tf.float32)

    piece_indicators = tf.placeholder(tf.int32, shape=[None], name="piece_filters")  # Given as an array of uint8s
    occupied_bbs = tf.placeholder(tf.int64, shape=[None], name="occupied_bbs")  # Given as an array of uint64s

    # The code below this comment defines ops which are run during inference

    # Each int64 bitboard is viewed as 4 uint16 chunks, which are used as indices into the lookup tables
    occupied_bitcasted = tf.cast(tf.bitcast(occupied_bbs, tf.uint16), dtype=tf.int32)

    # The op names are given by keyword since the third positional parameter of the TF 1.x tf.gather
    # is validate_indices (not name), which caused the names to previously be silently discarded
    partial_popcounts = tf.gather(popcount_lookup, occupied_bitcasted, name="byte_popcount_loopkup")
    partial_popcounts = tf.cast(partial_popcounts, tf.int32)
    occupied_popcounts = tf.reduce_sum(partial_popcounts, axis=-1, name="popcount_lookup_sum")

    location_mask = tf.gather(occupancy_mask_table, occupied_bitcasted, name="gather_location_mask")
    location_mask = tf.cast(location_mask, tf.bool)
    piece_coords = tf.boolean_mask(locations_for_masking, location_mask, "mask_desired_locations")

    gathered_pieces = tf.gather(piece_lookup_table, piece_indicators, name="gather_pieces")
    piece_filter_indices = tf.reshape(gathered_pieces, [-1, 1])

    # The index of each board, repeated once for every occupied square on that board
    repeated_board_numbers = range_repeater(occupied_popcounts)
    board_numbers_for_concat = tf.expand_dims(repeated_board_numbers, -1)

    # Removes either the last piece filter, or no filters (based on if the number of filters was odd and half of the final uint8 was padding)
    piece_filter_indices = piece_filter_indices[:tf.shape(board_numbers_for_concat)[0]]

    # Each row is [board number, filter index, rank, file]
    # Should figure out how this can be done with (or similarly to) tf.parallel_stack
    one_indices = tf.concat([board_numbers_for_concat, piece_filter_indices, piece_coords], axis=-1)

    boards = tf.scatter_nd(
        indices=one_indices,
        updates=ones_to_slice[:tf.shape(one_indices)[0]],
        shape=[tf.shape(occupied_bbs, out_type=tf.int64)[0], 15, 8, 8])

    if convert_to_nhwc:
        boards = tf.transpose(boards, [0, 2, 3, 1])

    return (piece_indicators, occupied_bbs), boards


def vec_and_transpose_op(vector, operation, output_type=None):
"""
Equivalent to running tf.cast(operation(tf.expand_dims(vector, 1), tf.expand_dims(vector, 0)), output_type)
Expand Down Expand Up @@ -76,9 +146,11 @@ def combine_graphdefs(graphdef_filenames, output_model_path, output_filename, ou
output_filename)


def remap_inputs(model_path, output_model_path, output_filename):
def remap_inputs(model_path, output_model_path, output_filename, max_batch_size=None):
with tf.Session() as sess:
placeholders, formatted_data = get_board_data()
with tf.device('/GPU:0'):
with tf.name_scope("input_parser"):
placeholders, formatted_data = parse_into_ann_input_inference(max_batch_size)

with open(model_path, 'r') as f:
graph_def = text_format.Parse(f.read(), tf.GraphDef())
Expand All @@ -98,10 +170,10 @@ def remap_inputs(model_path, output_model_path, output_filename):

def save_trt_graphdef(model_path, output_model_path, output_filename, output_node_names,
trt_memory_fraction=.5, total_video_memory=1.1e10,
max_batch_size=1000, write_as_text=True, ):
max_batch_size=1000, write_as_text=True):

#This would ideally be 1 instead of .75, but the GPU that this is running on is responsible for things like graphics
with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=.75 - trt_memory_fraction))) as sess:
#This would ideally be 1 instead of .85, but the GPU that this is running on is responsible for things like graphics
with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=.85 - trt_memory_fraction))) as sess:

with open(model_path, 'r') as f:
txt = f.read()
Expand Down Expand Up @@ -413,20 +485,23 @@ def metric_dict_creator(the_dict):
return metric_dict


def numpy_style_repeat_1d_creator(max_multiple=100, max_to_repeat=10000):
def numpy_style_repeat_1d_creator(max_multiple=100, max_to_repeat=10000, out_type=tf.int32):
board_num_lookup_ary = np.repeat(
np.arange(max_to_repeat),
np.full([max_to_repeat], max_multiple))
board_num_lookup_ary = board_num_lookup_ary.reshape(max_to_repeat, max_multiple)

def fn_to_return(multiples):
board_num_lookup_tensor = tf.constant(board_num_lookup_ary, dtype=tf.int32)
casted_multiples = tf.cast(multiples, dtype=tf.int32)
board_num_lookup_tensor = tf.constant(board_num_lookup_ary, dtype=out_type)

if multiples.dtype != tf.int32:
multiples = tf.cast(multiples, dtype=tf.int32)

padded_multiples = tf.pad(
casted_multiples,
multiples,
[[0, max_to_repeat - tf.shape(multiples)[0]]])

padded_multiples =tf.cast(padded_multiples, tf.float32)
padded_multiples = tf.cast(padded_multiples, tf.int32)
to_return = tf.boolean_mask(
board_num_lookup_tensor,
tf.sequence_mask(padded_multiples, maxlen=max_multiple))
Expand All @@ -435,7 +510,6 @@ def fn_to_return(multiples):
return fn_to_return



def count_tfrecords(filename):
return sum(1 for _ in tf.python_io.tf_record_iterator(filename))

Expand Down
60 changes: 28 additions & 32 deletions batch_first/anns/evaluation_ann.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,14 @@

import batch_first.anns.ann_creation_helper as ann_h

from batch_first.chestimator import get_board_data

tf.logging.set_verbosity(tf.logging.INFO)



def diag_comparison_model_fn(features, labels, mode, params):
"""
Generates an EstimatorSpec for a model which scores chess boards. It learns by maximizing the difference between
two board evaluations, where one is intended to be greater than the other based on some pre-calculated
board evaluation values, where one is intended to be greater than the other based on some pre-calculated
scoring system (e.g. StockFish evaluations).
"""
convolutional_module_outputs=ann_h.create_input_convolutions_shared_weights(
Expand Down Expand Up @@ -45,7 +43,7 @@ def diag_comparison_model_fn(features, labels, mode, params):
kernel_regularizer=params['kernel_regularizer'](),
name="logit_layer")

logits = tf.squeeze(logits)
logits = tf.squeeze(logits, axis=[1,2,3])

loss = None
train_op = None
Expand Down Expand Up @@ -188,28 +186,29 @@ def process_batch(records):

comparison_indices = tf.where(bool_weight_mask)

value_larger_than_centipawn_less_than_mate = 100000
desired_found_mate = tf.greater(tf.abs(parsed_examples['score']), value_larger_than_centipawn_less_than_mate)
# value_larger_than_centipawn_less_than_mate = 100000
# desired_found_mate = tf.greater(tf.abs(parsed_examples['score']), value_larger_than_centipawn_less_than_mate)

both_found_mate = ann_h.vec_and_transpose_op(desired_found_mate, tf.logical_and)
# both_found_mate = ann_h.vec_and_transpose_op(desired_found_mate, tf.logical_and)

desired_signs = tf.sign(parsed_examples['score'])
# desired_signs = tf.sign(parsed_examples['score'])

same_sign_matrix = ann_h.vec_and_transpose_op(desired_signs, tf.equal)
# same_sign_matrix = ann_h.vec_and_transpose_op(desired_signs, tf.equal)

both_same_player_mates = tf.logical_and(both_found_mate, same_sign_matrix)
# both_same_player_mates = tf.logical_and(both_found_mate, same_sign_matrix)

both_same_mate_and_nonzero_weight = tf.logical_and(both_same_player_mates, bool_weight_mask)
# both_same_mate_and_nonzero_weight = tf.logical_and(both_same_player_mates, bool_weight_mask)

same_mate_depth_diff_decrement = .95
# same_mate_depth_diff_decrement = .95

weight_helper = same_mate_depth_diff_decrement * tf.cast(both_same_mate_and_nonzero_weight, tf.float32)
# weight_helper = same_mate_depth_diff_decrement * tf.cast(both_same_mate_and_nonzero_weight, tf.float32)

mate_adjusted_weight_mask = weight_mask - weight_helper
# mate_adjusted_weight_mask = weight_mask - weight_helper

label_matrix = (lower_diag_sign + weight_mask)/2

return boards, parsed_examples['score'], label_matrix, mate_adjusted_weight_mask, comparison_indices
# return boards, parsed_examples['score'], label_matrix, mate_adjusted_weight_mask, comparison_indices
return boards, parsed_examples['score'], label_matrix, weight_mask, comparison_indices


dataset = dataset.map(process_batch, num_parallel_calls=num_things_in_parallel)
Expand All @@ -227,31 +226,27 @@ def process_batch(records):

def board_eval_serving_input_receiver(data_format="NCHW"):
def fn_to_return():
(piece_bbs, color_occupied_bbs, ep_squares, castling_lookup_indices, kings), formatted_data = get_board_data(data_format)

receiver_tensors = {"piece_bbs": piece_bbs,
"color_occupied_bbs": color_occupied_bbs,
"ep_squares": ep_squares,
"castling_lookup_indices": castling_lookup_indices,
"kings": kings}
placeholder_shape = [None, 15, 8, 8] if data_format=="NCHW" else [None, 8, 8, 15]

dict_for_model_fn = {"board": formatted_data}
for_remapping = tf.placeholder(tf.float32, placeholder_shape, "FOR_INPUT_MAPPING_transpose")

return tf.estimator.export.ServingInputReceiver(dict_for_model_fn, receiver_tensors)
receiver_tensors = {"board": for_remapping}
return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)
return fn_to_return


def main(unused_par):
SAVE_MODEL_DIR = "/srv/tmp/diag_loss_3/pre_commit_test"

SAVE_MODEL_DIR = "/srv/tmp/diag_loss_3/pre_commit_test_2534111"
TRAINING_FILENAME_PATTERN = "/srv/databases/lichess_combined_methods_eval_databases/lichess_training.tfrecords"
VALIDATION_FILENAME_PATTERN = "/srv/databases/lichess_combined_methods_eval_databases/lichess_validation.tfrecords"
TRAIN_OP_SUMMARIES = ["gradient_norm", "gradients"]
NUM_INPUT_FILTERS = 15
OPTIMIZER = 'Adam'
TRAINING_SHUFFLE_BUFFER_SIZE = 32790000
TRAINING_SHUFFLE_BUFFER_SIZE = 16800000
TRAINING_BATCH_SIZE = 512 #The effective batch size used for the loss = n(n-1)/2 (where n is the number of boards in the batch)
VALIDATION_BATCH_SIZE = 1000
LOG_ITERATION_INTERVAL = 1500
LOG_ITERATION_INTERVAL = 2500
LEARNING_RATE = 2.5e-3
KERNEL_REGULARIZER = lambda: None
KERNEL_INITIALIZER = lambda: tf.contrib.layers.variance_scaling_initializer()
Expand All @@ -260,8 +255,8 @@ def main(unused_par):
SAME_MATE_DEPTH_DIFF_LOSS_WEIGHT_DECREMENT = .95
VALUE_LARGER_THAN_CENTIPAWN_LESS_THAN_MATE = 100000

num_examples_in_training_file = 32792847
num_examples_in_validation_file = 3857982
num_examples_in_training_file = 16851682
num_examples_in_validation_file = 1982551

BATCHES_IN_TRAINING_EPOCH = num_examples_in_training_file // TRAINING_BATCH_SIZE
BATCHES_IN_VALIDATION_EPOCH = num_examples_in_validation_file // VALIDATION_BATCH_SIZE
Expand All @@ -270,7 +265,7 @@ def main(unused_par):
learning_decay_function = lambda gs : tf.train.exponential_decay(LEARNING_RATE, gs,
BATCHES_IN_TRAINING_EPOCH, 0.96, staircase=True)

CONVOLUTIONAL_MODULES = [[[[512, 1], [128, 1]] + 4 * [[32, 3]] + [(16, 8)]]]
CONVOLUTIONAL_MODULES = [[[[512, 1], [128, 1]] + 6 * [[32, 3]] + [(16, 8)]]]


# Create the Estimator
Expand Down Expand Up @@ -299,7 +294,7 @@ def main(unused_par):


validation_hook = ann_h.ValidationRunHook(
step_increment=BATCHES_IN_TRAINING_EPOCH//3,
step_increment=BATCHES_IN_TRAINING_EPOCH,
estimator=the_estimator,
input_fn_creator=lambda: lambda : lower_diag_score_comparison_input_fn(
VALIDATION_FILENAME_PATTERN,
Expand All @@ -318,8 +313,9 @@ def main(unused_par):
shuffle_buffer_size=TRAINING_SHUFFLE_BUFFER_SIZE,
include_unoccupied=NUM_INPUT_FILTERS == 16,
num_things_in_parallel=12,
num_things_to_prefetch=1,
num_things_to_prefetch=36,
data_format=DATA_FORMAT,
shuffle_seed=12312312,
),
hooks=[validation_hook],
# max_steps=1,
Expand Down
Loading

0 comments on commit 90eee66

Please sign in to comment.