Significant Speed Improvements

- Significantly increased JIT compilation coverage (currently the only thing preventing complete compilation is the TensorFlow calls)
- Changed how boards are given from CPU to GPU, now using between 80 and 200 bits per board (depending on occupancy), as opposed to the previous 480
- Now using a fully JIT-compiled PriorityBin implementation, by utilizing the linked list system used in the negamax search
- Added the classes_and_structs.py file to hold JitClasses and NumPy structured scalar types (and basic functions involving them)
- Improved the test for ANN inference
- Various minor refactoring and code cleaning
SamRagusa · Nov 22, 2018 · 90eee66 · 90eee66
1 parent a2eed16
commit 90eee66
Show file tree

Hide file tree

Showing 13 changed files with 822 additions and 792 deletions.
diff --git a/batch_first/__init__.py b/batch_first/__init__.py
@@ -48,7 +48,6 @@ def get_table_and_array_for_set_of_dicts(dicts):
     return index_lookup_table, array
 
 
-
 def generate_move_filter_table():
     """
     Generate a lookup table for the policy encoding described in the following paper:
@@ -93,7 +92,6 @@ def generate_move_filter_table():
     return filter_table
 
 
-
 MOVE_FILTER_LOOKUP = generate_move_filter_table()
 
 
@@ -171,7 +169,6 @@ def generate_move_filter_table():
     LOSS_RESULT_SCORES[j] = np.nextafter(LOSS_RESULT_SCORES[j - 1], MAX_FLOAT32_VAL)
 
 
-
 SIZE_EXPONENT_OF_TWO_FOR_TT_INDICES = np.uint8(30)
 TT_HASH_MASK = np.uint64(2 ** (SIZE_EXPONENT_OF_TWO_FOR_TT_INDICES) - 1)
 
@@ -297,11 +294,11 @@ def power_set(iterable):
 flip_vert_const_1 = np.uint64(0x00FF00FF00FF00FF)
 flip_vert_const_2 = np.uint64(0x0000FFFF0000FFFF)
 
-@nb.vectorize([nb.uint64(nb.uint64)])
-def vectorized_flip_vertically(bb):
-    bb = ((bb >> 8) & flip_vert_const_1) | ((bb & flip_vert_const_1) << 8)
+@nb.vectorize([nb.uint64(nb.uint64)], nopython=True)
+def flip_vertically(bb):
+    bb = ((bb >>  8) & flip_vert_const_1) | ((bb & flip_vert_const_1) <<  8)
     bb = ((bb >> 16) & flip_vert_const_2) | ((bb & flip_vert_const_2) << 16)
-    bb = (bb >> 32) | (bb << 32)
+    bb = ( bb >> 32) | ( bb << 32)
     return bb
 
 def get_castling_lookup_tables():
@@ -310,7 +307,7 @@ def get_castling_lookup_tables():
         possible_castling_rights[j] = np.uint64(functools.reduce(lambda x, y: x | y, set, np.uint64(0)))
 
     white_turn_castling_tables = create_index_table(possible_castling_rights)
-    black_turn_castling_tables = create_index_table(vectorized_flip_vertically(possible_castling_rights))
+    black_turn_castling_tables = create_index_table(flip_vertically(possible_castling_rights))
 
     return white_turn_castling_tables, black_turn_castling_tables, possible_castling_rights
 

diff --git a/batch_first/anns/ann_creation_helper.py b/batch_first/anns/ann_creation_helper.py
@@ -4,16 +4,86 @@
 from tensorflow.contrib import layers
 from tensorflow.python import training
 
+import chess
+
 from functools import reduce
 
-from batch_first.chestimator import get_board_data
 from google.protobuf import text_format
 
+from batch_first.numba_board import popcount
+
+
 from tensorflow.contrib import tensorrt as trt
 
 
 
 
+def parse_into_ann_input_inference(max_boards, convert_to_nhwc=False):
+    """
+    NOTES:
+    1) If a constant/operation is typed in a confusing manner, it's so the entirety of this can be done on the GPU
+    """
+    possible_lookup_nums = np.arange(2 ** 16, dtype=np.uint16)
+    num_bits = popcount(possible_lookup_nums.astype(np.uint64))
+
+    location_lookup_ary = np.array([[[chess.square_rank(loc), chess.square_file(loc)] for loc in chess.SQUARES_180]], np.int32)
+    location_lookup_ary = np.ones([max_boards, 1, 1], np.int32) * location_lookup_ary
+
+    location_lookup_ary = location_lookup_ary.reshape([max_boards, 8, 8, 2])[:, ::-1]
+    location_lookup_ary = location_lookup_ary.reshape([max_boards, 4, 16, 2])
+
+    mask_getter = lambda n: np.unpackbits(np.frombuffer(n, dtype=np.uint8)[::-1])[::-1]
+    masks_to_gather_ary = np.array(list(map(mask_getter, possible_lookup_nums)), dtype=np.bool_)
+
+    pieces_from_nums = lambda n: [n >> 4, (n & np.uint8(0x0F))]
+    piece_lookup_ary = np.array(list(map(pieces_from_nums, possible_lookup_nums)), dtype=np.int32)
+
+    range_repeater = numpy_style_repeat_1d_creator(max_multiple=33, max_to_repeat=max_boards, out_type=tf.int64)
+
+    popcount_lookup = tf.constant(num_bits, tf.int64)
+    locations_for_masking = tf.constant(location_lookup_ary, tf.int64)
+    occupancy_mask_table = tf.constant(masks_to_gather_ary, tf.half)
+    piece_lookup_table = tf.constant(piece_lookup_ary, tf.int64)
+
+    ones_to_slice = tf.constant(np.ones(33 * max_boards), dtype=tf.float32)  # This is used since there seems to be no simple/efficient way to broadcast for scatter_nd
+
+    piece_indicators = tf.placeholder(tf.int32, shape=[None], name="piece_filters")  #Given as an array of uint8s
+    occupied_bbs = tf.placeholder(tf.int64, shape=[None], name="occupied_bbs")       #Given as an array of uint64s
+
+    # The code below this comment defines ops which are run during inference
+
+    occupied_bitcasted = tf.cast(tf.bitcast(occupied_bbs, tf.uint16), dtype=tf.int32)
+
+    partial_popcounts = tf.gather(popcount_lookup, occupied_bitcasted, "byte_popcount_loopkup")
+    partial_popcounts = tf.cast(partial_popcounts, tf.int32)
+    occupied_popcounts = tf.reduce_sum(partial_popcounts, axis=-1, name="popcount_lookup_sum")
+
+    location_mask = tf.gather(occupancy_mask_table, occupied_bitcasted, "gather_location_mask")
+    location_mask = tf.cast(location_mask, tf.bool)
+    piece_coords = tf.boolean_mask(locations_for_masking, location_mask, "mask_desired_locations")
+
+    gathered_pieces = tf.gather(piece_lookup_table, piece_indicators, "gather_pieces")
+    piece_filter_indices = tf.reshape(gathered_pieces, [-1, 1])
+
+    repeated_board_numbers = range_repeater(occupied_popcounts)
+    board_numbers_for_concat = tf.expand_dims(repeated_board_numbers, -1)
+
+    # Removes either the last piece filter, or no filters (based on if the number of filters was odd and half of the final uint8 was padding)
+    piece_filter_indices = piece_filter_indices[:tf.shape(board_numbers_for_concat)[0]]
+
+    one_indices = tf.concat([board_numbers_for_concat, piece_filter_indices, piece_coords], axis=-1) #Should figure out how this can be done with (or similarly to) tf.parallel_stack
+
+    boards = tf.scatter_nd(
+        indices=one_indices,
+        updates=ones_to_slice[:tf.shape(one_indices)[0]],
+        shape=[tf.shape(occupied_bbs, out_type=tf.int64)[0], 15, 8, 8])
+
+    if convert_to_nhwc:
+        boards = tf.transpose(boards, [0,2,3,1])
+
+    return (piece_indicators, occupied_bbs), boards
+
+
 def vec_and_transpose_op(vector, operation, output_type=None):
     """
     Equivalent to running tf.cast(operation(tf.expand_dims(vector, 1), tf.expand_dims(vector, 0)), output_type)
@@ -76,9 +146,11 @@ def combine_graphdefs(graphdef_filenames, output_model_path, output_filename, ou
             output_filename)
 
 
-def remap_inputs(model_path, output_model_path, output_filename):
+def remap_inputs(model_path, output_model_path, output_filename, max_batch_size=None):
     with tf.Session() as sess:
-        placeholders, formatted_data = get_board_data()
+        with tf.device('/GPU:0'):
+            with tf.name_scope("input_parser"):
+                placeholders, formatted_data = parse_into_ann_input_inference(max_batch_size)
 
         with open(model_path, 'r') as f:
             graph_def = text_format.Parse(f.read(), tf.GraphDef())
@@ -98,10 +170,10 @@ def remap_inputs(model_path, output_model_path, output_filename):
 
 def save_trt_graphdef(model_path, output_model_path, output_filename, output_node_names,
                       trt_memory_fraction=.5, total_video_memory=1.1e10,
-                      max_batch_size=1000, write_as_text=True, ):
+                      max_batch_size=1000, write_as_text=True):
 
-    #This would ideally be 1 instead of .75, but the GPU that this is running on is responsible for things like graphics
-    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=.75 - trt_memory_fraction))) as sess:
+    #This would ideally be 1 instead of .85, but the GPU that this is running on is responsible for things like graphics
+    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=.85 - trt_memory_fraction))) as sess:
 
         with open(model_path, 'r') as f:
             txt = f.read()
@@ -413,20 +485,23 @@ def metric_dict_creator(the_dict):
     return metric_dict
 
 
-def numpy_style_repeat_1d_creator(max_multiple=100, max_to_repeat=10000):
+def numpy_style_repeat_1d_creator(max_multiple=100, max_to_repeat=10000, out_type=tf.int32):
     board_num_lookup_ary = np.repeat(
         np.arange(max_to_repeat),
         np.full([max_to_repeat], max_multiple))
     board_num_lookup_ary = board_num_lookup_ary.reshape(max_to_repeat, max_multiple)
 
     def fn_to_return(multiples):
-        board_num_lookup_tensor = tf.constant(board_num_lookup_ary, dtype=tf.int32)
-        casted_multiples = tf.cast(multiples, dtype=tf.int32)
+        board_num_lookup_tensor = tf.constant(board_num_lookup_ary, dtype=out_type)
+
+        if multiples.dtype != tf.int32:
+            multiples = tf.cast(multiples, dtype=tf.int32)
+
         padded_multiples = tf.pad(
-            casted_multiples,
+            multiples,
             [[0, max_to_repeat - tf.shape(multiples)[0]]])
 
-        padded_multiples =tf.cast(padded_multiples, tf.float32)
+        padded_multiples = tf.cast(padded_multiples, tf.int32)
         to_return =  tf.boolean_mask(
             board_num_lookup_tensor,
             tf.sequence_mask(padded_multiples, maxlen=max_multiple))
@@ -435,7 +510,6 @@ def fn_to_return(multiples):
     return fn_to_return
 
 
-
 def count_tfrecords(filename):
     return sum(1 for _ in tf.python_io.tf_record_iterator(filename))
 

diff --git a/batch_first/anns/evaluation_ann.py b/batch_first/anns/evaluation_ann.py
@@ -4,16 +4,14 @@
 
 import batch_first.anns.ann_creation_helper as ann_h
 
-from batch_first.chestimator import get_board_data
-
 tf.logging.set_verbosity(tf.logging.INFO)
 
 
 
 def diag_comparison_model_fn(features, labels, mode, params):
     """
     Generates an EstimatorSpec for a model which scores chess boards.  It learns by maximizing the difference between
-    two board evaluations, where one is intended to be greater than the other based on some pre-calculated
+    board evaluation values, where one is intended to be greater than the other based on some pre-calculated
     scoring system (e.g. StockFish evaluations).
     """
     convolutional_module_outputs=ann_h.create_input_convolutions_shared_weights(
@@ -45,7 +43,7 @@ def diag_comparison_model_fn(features, labels, mode, params):
         kernel_regularizer=params['kernel_regularizer'](),
         name="logit_layer")
 
-    logits = tf.squeeze(logits)
+    logits = tf.squeeze(logits, axis=[1,2,3])
 
     loss = None
     train_op = None
@@ -188,28 +186,29 @@ def process_batch(records):
 
         comparison_indices = tf.where(bool_weight_mask)
 
-        value_larger_than_centipawn_less_than_mate = 100000
-        desired_found_mate = tf.greater(tf.abs(parsed_examples['score']), value_larger_than_centipawn_less_than_mate)
+        # value_larger_than_centipawn_less_than_mate = 100000
+        # desired_found_mate = tf.greater(tf.abs(parsed_examples['score']), value_larger_than_centipawn_less_than_mate)
 
-        both_found_mate = ann_h.vec_and_transpose_op(desired_found_mate, tf.logical_and)
+        # both_found_mate = ann_h.vec_and_transpose_op(desired_found_mate, tf.logical_and)
 
-        desired_signs = tf.sign(parsed_examples['score'])
+        # desired_signs = tf.sign(parsed_examples['score'])
 
-        same_sign_matrix = ann_h.vec_and_transpose_op(desired_signs, tf.equal)
+        # same_sign_matrix = ann_h.vec_and_transpose_op(desired_signs, tf.equal)
 
-        both_same_player_mates = tf.logical_and(both_found_mate, same_sign_matrix)
+        # both_same_player_mates = tf.logical_and(both_found_mate, same_sign_matrix)
 
-        both_same_mate_and_nonzero_weight = tf.logical_and(both_same_player_mates, bool_weight_mask)
+        # both_same_mate_and_nonzero_weight = tf.logical_and(both_same_player_mates, bool_weight_mask)
 
-        same_mate_depth_diff_decrement = .95
+        # same_mate_depth_diff_decrement = .95
 
-        weight_helper = same_mate_depth_diff_decrement * tf.cast(both_same_mate_and_nonzero_weight, tf.float32)
+        # weight_helper = same_mate_depth_diff_decrement * tf.cast(both_same_mate_and_nonzero_weight, tf.float32)
 
-        mate_adjusted_weight_mask = weight_mask - weight_helper
+        # mate_adjusted_weight_mask = weight_mask - weight_helper
 
         label_matrix = (lower_diag_sign + weight_mask)/2
 
-        return boards, parsed_examples['score'], label_matrix, mate_adjusted_weight_mask, comparison_indices
+        # return boards, parsed_examples['score'], label_matrix, mate_adjusted_weight_mask, comparison_indices
+        return boards, parsed_examples['score'], label_matrix, weight_mask, comparison_indices
 
 
     dataset = dataset.map(process_batch, num_parallel_calls=num_things_in_parallel)
@@ -227,31 +226,27 @@ def process_batch(records):
 
 def board_eval_serving_input_receiver(data_format="NCHW"):
     def fn_to_return():
-        (piece_bbs, color_occupied_bbs, ep_squares, castling_lookup_indices, kings), formatted_data = get_board_data(data_format)
-
-        receiver_tensors = {"piece_bbs": piece_bbs,
-                            "color_occupied_bbs": color_occupied_bbs,
-                            "ep_squares": ep_squares,
-                            "castling_lookup_indices": castling_lookup_indices,
-                            "kings": kings}
+        placeholder_shape = [None, 15, 8, 8] if data_format=="NCHW" else [None, 8, 8, 15]
 
-        dict_for_model_fn = {"board": formatted_data}
+        for_remapping = tf.placeholder(tf.float32, placeholder_shape, "FOR_INPUT_MAPPING_transpose")
 
-        return tf.estimator.export.ServingInputReceiver(dict_for_model_fn, receiver_tensors)
+        receiver_tensors = {"board": for_remapping}
+        return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)
     return fn_to_return
 
 
 def main(unused_par):
-    SAVE_MODEL_DIR = "/srv/tmp/diag_loss_3/pre_commit_test"
+
+    SAVE_MODEL_DIR = "/srv/tmp/diag_loss_3/pre_commit_test_2534111"
     TRAINING_FILENAME_PATTERN = "/srv/databases/lichess_combined_methods_eval_databases/lichess_training.tfrecords"
     VALIDATION_FILENAME_PATTERN = "/srv/databases/lichess_combined_methods_eval_databases/lichess_validation.tfrecords"
     TRAIN_OP_SUMMARIES = ["gradient_norm", "gradients"]
     NUM_INPUT_FILTERS = 15
     OPTIMIZER = 'Adam'
-    TRAINING_SHUFFLE_BUFFER_SIZE = 32790000
+    TRAINING_SHUFFLE_BUFFER_SIZE = 16800000
     TRAINING_BATCH_SIZE = 512      #The effective batch size used for the loss = n(n-1)/2  (where n is the number of boards in the batch)
     VALIDATION_BATCH_SIZE = 1000
-    LOG_ITERATION_INTERVAL = 1500
+    LOG_ITERATION_INTERVAL = 2500
     LEARNING_RATE = 2.5e-3
     KERNEL_REGULARIZER = lambda: None
     KERNEL_INITIALIZER = lambda: tf.contrib.layers.variance_scaling_initializer()
@@ -260,8 +255,8 @@ def main(unused_par):
     SAME_MATE_DEPTH_DIFF_LOSS_WEIGHT_DECREMENT = .95
     VALUE_LARGER_THAN_CENTIPAWN_LESS_THAN_MATE = 100000
 
-    num_examples_in_training_file = 32792847
-    num_examples_in_validation_file = 3857982
+    num_examples_in_training_file = 16851682
+    num_examples_in_validation_file = 1982551
 
     BATCHES_IN_TRAINING_EPOCH = num_examples_in_training_file // TRAINING_BATCH_SIZE
     BATCHES_IN_VALIDATION_EPOCH =  num_examples_in_validation_file // VALIDATION_BATCH_SIZE
@@ -270,7 +265,7 @@ def main(unused_par):
     learning_decay_function = lambda gs : tf.train.exponential_decay(LEARNING_RATE, gs,
                                                                      BATCHES_IN_TRAINING_EPOCH, 0.96, staircase=True)
 
-    CONVOLUTIONAL_MODULES = [[[[512, 1], [128, 1]] + 4 * [[32, 3]] + [(16, 8)]]]
+    CONVOLUTIONAL_MODULES = [[[[512, 1], [128, 1]] + 6 * [[32, 3]] + [(16, 8)]]]
 
 
     # Create the Estimator
@@ -299,7 +294,7 @@ def main(unused_par):
 
 
     validation_hook = ann_h.ValidationRunHook(
-        step_increment=BATCHES_IN_TRAINING_EPOCH//3,
+        step_increment=BATCHES_IN_TRAINING_EPOCH,
         estimator=the_estimator,
         input_fn_creator=lambda: lambda : lower_diag_score_comparison_input_fn(
             VALIDATION_FILENAME_PATTERN,
@@ -318,8 +313,9 @@ def main(unused_par):
             shuffle_buffer_size=TRAINING_SHUFFLE_BUFFER_SIZE,
             include_unoccupied=NUM_INPUT_FILTERS == 16,
             num_things_in_parallel=12,
-            num_things_to_prefetch=1,
+            num_things_to_prefetch=36,
             data_format=DATA_FORMAT,
+            shuffle_seed=12312312,
         ),
         hooks=[validation_hook],
         # max_steps=1,