diff --git a/pluribus/games/short_deck/agent.py b/pluribus/games/short_deck/agent.py new file mode 100644 index 00000000..c93cf4cf --- /dev/null +++ b/pluribus/games/short_deck/agent.py @@ -0,0 +1,30 @@ +import collections +import joblib + + +class Agent: + """Agent class can hold a trained strategy and regret""" + def __init__(self, regret_path=None): + self.strategy = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0) + ) + if regret_path: + offline_strategy = joblib.load(regret_path) + self.regret = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0), + offline_strategy['regret'] + ) + else: + self.regret = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0) + ) + self.tmp_regret = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0) + ) + + def reset_new_regret(self): + """Remove regret from temporary storage""" + del self.tmp_regret + self.tmp_regret = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0) + ) diff --git a/pluribus/games/short_deck/state.py b/pluribus/games/short_deck/state.py index af2e08b7..ecb2d273 100644 --- a/pluribus/games/short_deck/state.py +++ b/pluribus/games/short_deck/state.py @@ -7,8 +7,11 @@ import operator import os from typing import Any, Dict, List, Optional, Tuple +from itertools import combinations +import random import dill as pickle +import numpy as np from pluribus import utils from pluribus.poker.card import Card @@ -32,7 +35,8 @@ def new_game( ] if info_set_lut: # Don't reload massive files, it takes ages. - state = ShortDeckPokerState(players=players, load_pickle_files=False, **kwargs) + state = ShortDeckPokerState(players=players, + load_pickle_files=False, **kwargs) state.info_set_lut = info_set_lut else: # Load massive files. @@ -54,6 +58,8 @@ def __init__( big_blind: int = 100, pickle_dir: str = ".", load_pickle_files: bool = True, + real_time_test: bool = False, + public_cards: List[Card] = [] ): """Initialise state.""" n_players = len(players) @@ -74,6 +80,7 @@ def __init__( self._initial_n_chips = players[0].n_chips self.small_blind = small_blind self.big_blind = big_blind + self.real_time_test = real_time_test self._poker_engine = PokerEngine( table=self._table, small_blind=small_blind, big_blind=big_blind ) @@ -81,9 +88,13 @@ def __init__( # this), assign blinds to the players. self._poker_engine.round_setup() # Deal private cards to players. - self._table.dealer.deal_private_cards(self._table.players) + if not self.real_time_test: + self._poker_engine.table.dealer.deal_private_cards( + self._table.players + ) # Store the actions as they come in here. 
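        # `_public_information` (below) snapshots the board as it stood at each
        # betting stage; `_update_hole_cards_bayes` later reads these snapshots
        # back to rebuild the info sets opponents actually acted on.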
self._history: Dict[str, List[str]] = collections.defaultdict(list) + self._public_information: Dict[str, List[Card]] = collections.defaultdict(list) self._betting_stage = "pre_flop" self._betting_stage_to_round: Dict[str, int] = { "pre_flop": 0, @@ -107,11 +118,19 @@ def __init__( "terminal": player_i_order, } self._skip_counter = 0 - self._first_move_of_current_round = True + # self._first_move_of_current_round = True self._reset_betting_round_state() for player in self.players: player.is_turn = False self.current_player.is_turn = True + if public_cards: + assert len(public_cards) in {3, 4, 5} + self._public_cards = public_cards + self._final_action = None + # only want to do these actions in real game play, as they are slow + if self.real_time_test: + # must have offline strategy loaded up + self._starting_hand_probs = self._initialize_starting_hands() def __repr__(self): """Return a helpful description of object in strings and debugger.""" @@ -145,7 +164,6 @@ def apply_action(self, action_str: Optional[str]) -> ShortDeckPokerState: new_state.info_set_lut = self.info_set_lut = lut # An action has been made, so alas we are not in the first move of the # current betting round. - new_state._first_move_of_current_round = False if action_str is None: # Assert active player has folded already. assert ( @@ -189,7 +207,6 @@ def apply_action(self, action_str: Optional[str]) -> ShortDeckPokerState: # stage of the game. new_state._increment_stage() new_state._reset_betting_round_state() - new_state._first_move_of_current_round = True if not new_state.current_player.is_active: new_state._skip_counter += 1 assert not new_state.current_player.is_active @@ -209,6 +226,63 @@ def apply_action(self, action_str: Optional[str]) -> ShortDeckPokerState: new_state.current_player.is_turn = True return new_state + def load_game_state(self, offline_strategy: Dict[str, Dict[str, float]], + action_sequence: list): + """ + Follow through the action sequence provided to get current node. + :param action_sequence: List of actions without 'skip' + """ + if 'skip' in set(action_sequence): + action_sequence = [a for a in action_sequence if a != 'skip'] + if len(action_sequence) == 1: + # TODO: Not sure if I need to deepcopy + betting_stage = self.betting_stage + public_cards = self._public_cards + # Must declare the appropriate amount of public cards for RTS.. + assert self._public_information[betting_stage] == public_cards + lut = self.info_set_lut + self.info_set_lut = {} + new_state = copy.deepcopy(self) + new_state.info_set_lut = self.info_set_lut = lut + new_state._final_action = action_sequence.pop(0) + new_state._update_hole_cards_bayes(offline_strategy) + return new_state + a = action_sequence.pop(0) + new_state = self.apply_action(a) + return new_state.load_game_state(offline_strategy, action_sequence) + + def deal_bayes(self): + # Deep copy the parts of state that are needed that must be immutable + # from state to state. + lut = self.info_set_lut + self.info_set_lut = {} + new_state = copy.deepcopy(self) + new_state.info_set_lut = self.info_set_lut = lut + players = list(range(len(new_state.players))) + random.shuffle(players) + cards_selected = [] + # TODO: This would be better by selecting the first player's + # cards, then normalizing the second and third, etc.. 
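        # Rejection-sample each player's hole cards from the Bayes-updated
        # distribution in the loop below, re-drawing whenever the sampled hand
        # overlaps cards already assigned to another player or to the board.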
+ for p_i in players: + starting_hand = new_state._get_starting_hand(p_i) + len_union = len(set(starting_hand).union(set(cards_selected))) + len_individual = len(starting_hand) + len(cards_selected) + while len_union < len_individual: + starting_hand = new_state._get_starting_hand(p_i) + len_union = len(set(starting_hand).union(set(cards_selected))) + len_individual = len(starting_hand) + len(cards_selected) + # TODO: pull this into a helper method, maybe it should + # be in the dealer class.. + for card in starting_hand: + new_state.players[p_i].add_private_card(card) + cards_selected += starting_hand + cards_selected += new_state._public_cards + for card in cards_selected: + new_state._table.dealer.deck.remove(card) + final_action = new_state._final_action + newest_state = new_state.apply_action(final_action) + return newest_state + @staticmethod def load_pickle_files(pickle_dir: str) -> Dict[str, Dict[Tuple[int, ...], str]]: """Load pickle files into memory.""" @@ -254,15 +328,36 @@ def _increment_stage(self): if self._betting_stage == "pre_flop": # Progress from private cards to the flop. self._betting_stage = "flop" - self._poker_engine.table.dealer.deal_flop(self._table) + if len(self._public_cards) >= 3: + community_cards = self._public_cards[:3] + self._poker_engine.table.community_cards += community_cards + else: + self._poker_engine.table.dealer.deal_flop(self._table) + self._public_information[ + self.betting_stage + ] = self._table.community_cards.copy() elif self._betting_stage == "flop": # Progress from flop to turn. self._betting_stage = "turn" - self._poker_engine.table.dealer.deal_turn(self._table) + if len(self._public_cards) >= 4: + community_cards = self._public_cards[3:4] + self._poker_engine.table.community_cards += community_cards + else: + self._poker_engine.table.dealer.deal_turn(self._table) + self._public_information[ + self.betting_stage + ] = self._table.community_cards.copy() elif self._betting_stage == "turn": # Progress from turn to river. self._betting_stage = "river" - self._poker_engine.table.dealer.deal_river(self._table) + if len(self._public_cards) == 5: + community_cards = self._public_cards[4:] + self._poker_engine.table.community_cards += community_cards + else: + self._poker_engine.table.dealer.deal_river(self._table) + self._public_information[ + self.betting_stage + ] = self._table.community_cards.copy() elif self._betting_stage == "river": # Progress to the showdown. 
self._betting_stage = "show_down" @@ -271,6 +366,200 @@ def _increment_stage(self): else: raise ValueError(f"Unknown betting_stage: {self._betting_stage}") + def _initialize_starting_hands(self) -> Dict[int, Dict[List[Card], float]]: + """Dictionary of starting hands to store probabilities in""" + assert self.betting_stage == "pre_flop" + starting_hand_probs: Dict = {} + n_players = len(self.players) + starting_hands = self._get_card_combos(2) + for p_i in range(n_players): + starting_hand_probs[p_i] = {} + for starting_hand in starting_hands: + starting_hand_probs[p_i][ + starting_hand + ] = 1 + return starting_hand_probs + + def _get_card_combos(self, num_cards) -> List[Tuple[Any, ...]]: + """Get combinations of cards""" + return list(combinations(self.cards_in_deck, num_cards)) + + def _normalize_bayes(self): + """Normalize probability of reach for each player""" + n_players = len(self.players) + for p_i in range(n_players): + total_prob = sum(self._starting_hand_probs[p_i].values()) + for starting_hand, prob in self._starting_hand_probs[p_i].items(): + self._starting_hand_probs[p_i][starting_hand] = prob / total_prob + + def _update_hole_cards_bayes(self, offline_strategy: Dict[str, Dict[str, + float]]): + """Get probability of reach for each starting hand for each player""" + assert self._history + n_players = len(self._table.players) + player_indices: List[int] = [p_i for p_i in range(n_players)] + for p_i in player_indices: + # TODO: Might make since to put starting hands in the deck class + for starting_hand in self._starting_hand_probs[p_i].keys(): + starting_hand = list( + starting_hand + ) + # TODO: Is this bad? + if "p_reach" in locals(): + del p_reach + action_sequence: Dict[str, List[str]] = collections.defaultdict(list) + for idx, betting_stage in enumerate(self._history.keys()): + n_actions_round = len(self._history[betting_stage]) + for i in range(n_actions_round): + action = self._history[betting_stage][i] + while action == 'skip': + i += 1 # Action sequences don't end in skip + action = self._history[betting_stage][i] + # TODO: Maybe a method already exists for this? + if betting_stage == "pre_flop": + ph = (i + 2) % n_players + else: + ph = i % n_players + if p_i != ph: + prob_reach_all_hands = [] + for opp_starting_hand in self._starting_hand_probs[ + p_i + ].keys(): + opp_starting_hand = list( + opp_starting_hand + ) + publics = self._public_information[betting_stage] + if len( + set(opp_starting_hand).union( + set(publics) + ).union(set(starting_hand)) + ) < len( + opp_starting_hand + ) + len( + starting_hand + ) + len( + publics + ): + prob = 0 + else: + publics = self._public_information[ + betting_stage + ] + infoset = self._info_set_builder( + hole_cards=opp_starting_hand, + public_cards=publics, + history=action_sequence, + this_betting_stage=betting_stage, + ) + # Check to see if the strategy exists, + # if not equal probability + # TODO: is this overly hacky? 
+ # Problem with defaulting to 1 / 3, is that it + # it doesn't work for calculations that + # need to be made with the object's values + try: + prob = offline_strategy[infoset][action] + # Normalizing unnormalized offline_stregy + prob /= sum(offline_strategy[infoset]\ + .values()) + except KeyError: + prob = 1 / len(self.legal_actions) + prob_reach_all_hands.append(prob) + total_opp_prob_h = sum(prob_reach_all_hands) /\ + len(prob_reach_all_hands) + if "p_reach" not in locals(): + p_reach = total_opp_prob_h + else: + p_reach *= total_opp_prob_h + elif p_i == ph: + publics = self._public_information[betting_stage] + if len( + set(starting_hand).union( + set(publics) + ) + ) < ( + len(publics) + 2 + ): + total_prob = 0 + else: + publics = self._public_information[betting_stage] + infoset = self._info_set_builder( + hole_cards=starting_hand, + public_cards=publics, + history=action_sequence, + this_betting_stage=betting_stage, + ) + try: + total_prob = offline_strategy[infoset][action] + # Normalizing unnormalized offline_stregy + total_prob /= sum(offline_strategy[infoset]\ + .values()) + except KeyError: + total_prob = 1 / len(self.legal_actions) + if "p_reach" not in locals(): + p_reach = total_prob + else: + p_reach *= total_prob + action_sequence[betting_stage].append(action) + self._starting_hand_probs[p_i][tuple(starting_hand)] = p_reach + self._normalize_bayes() + + def _get_starting_hand(self, player_idx: int) -> List[Card]: + """Get starting hand based on probability of reach""" + starting_hands = list(self._starting_hand_probs[player_idx].keys()) + starting_hands_idxs = list(range(len(starting_hands))) + starting_hands_probs = list(self._starting_hand_probs[ + player_idx + ].values()) + starting_hand_idx = np.random.choice( + starting_hands_idxs, + 1, + p=starting_hands_probs + )[0] + starting_hand = list(starting_hands[starting_hand_idx]) + return starting_hand + + def _info_set_builder(self, hole_cards=None, public_cards=None, + history=None, this_betting_stage=None) -> str: + """Get the information set for the current player.""" + if hole_cards is None: + hole_cards = self.current_player.cards + if public_cards is None: + public_cards = self._table.community_cards + if history is None: + history = self._history + if this_betting_stage is None: + this_betting_stage = self._betting_stage + cards = sorted( + hole_cards, + key=operator.attrgetter("eval_card"), + reverse=True, + ) + cards += sorted( + public_cards, + key=operator.attrgetter("eval_card"), + reverse=True, + ) + eval_cards = tuple([int(card) for card in cards]) + try: + cards_cluster = self.info_set_lut[this_betting_stage][eval_cards] + except KeyError: + import ipdb; + ipdb.set_trace() + return "default info set, please ensure you load it correctly" + # Convert history from a dict of lists to a list of dicts as I'm + # paranoid about JSON's lack of care with insertion order. 
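        # For illustration only (the cluster id and actions are invented), the
        # string returned below looks something like:
        #   {"cards_cluster":6,"history":[{"pre_flop":["raise","call","call"]},{"flop":["call"]}]}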
+ info_set_dict = { + "cards_cluster": cards_cluster, + "history": [ + {betting_stage: [str(action) for action in actions]} + for betting_stage, actions in history.items() + ], + } + return json.dumps( + info_set_dict, separators=(",", ":"), cls=utils.io.NumpyJSONEncoder + ) + @property def community_cards(self) -> List[Card]: """Return all shared/public cards.""" @@ -281,6 +570,11 @@ def private_hands(self) -> Dict[ShortDeckPokerPlayer, List[Card]]: """Return all private hands.""" return {p: p.cards for p in self.players} + @property + def cards_in_deck(self): + """Returns current cards in deck""" + return self._table.dealer.deck._cards_in_deck + @property def initial_regret(self) -> Dict[str, float]: """Returns the default regret for this state.""" @@ -314,11 +608,11 @@ def player_i(self) -> int: @player_i.setter def player_i(self, _: Any): """Raise an error if player_i is set.""" - raise ValueError(f"The player_i property should not be set.") + raise ValueError("The player_i property should not be set.") @property def betting_round(self) -> int: - """Algorithm 1 of pluribus supp. material references betting_round.""" + """Return 0 indexed betting round""" try: betting_round = self._betting_stage_to_round[self._betting_stage] except KeyError: @@ -332,33 +626,7 @@ def betting_round(self) -> int: @property def info_set(self) -> str: """Get the information set for the current player.""" - cards = sorted( - self.current_player.cards, - key=operator.attrgetter("eval_card"), - reverse=True, - ) - cards += sorted( - self._table.community_cards, - key=operator.attrgetter("eval_card"), - reverse=True, - ) - eval_cards = tuple([card.eval_card for card in cards]) - try: - cards_cluster = self.info_set_lut[self._betting_stage][eval_cards] - except KeyError: - return "default info set, please ensure you load it correctly" - # Convert history from a dict of lists to a list of dicts as I'm - # paranoid about JSON's lack of care with insertion order. 
- info_set_dict = { - "cards_cluster": cards_cluster, - "history": [ - {betting_stage: [str(action) for action in actions]} - for betting_stage, actions in self._history.items() - ], - } - return json.dumps( - info_set_dict, separators=(",", ":"), cls=utils.io.NumpyJSONEncoder - ) + return self._info_set_builder() @property def payout(self) -> Dict[int, int]: diff --git a/pluribus/poker/card.py b/pluribus/poker/card.py index 5fe30a61..a3fc6db7 100644 --- a/pluribus/poker/card.py +++ b/pluribus/poker/card.py @@ -74,6 +74,9 @@ def __eq__(self, other): def __ne__(self, other): return int(self) != int(other) + def __hash__(self): + return hash(int(self)) + @property def eval_card(self) -> EvaluationCard: """Return an `EvaluationCard` for use in the `Evaluator`.""" @@ -178,4 +181,3 @@ def from_dict(x: Dict[str, Union[int, str]]): if set(x) != {"rank", "suit"}: raise NotImplementedError(f"Unrecognised dict {x}") return Card(rank=x["rank"], suit=x["suit"]) - diff --git a/pluribus/poker/deck.py b/pluribus/poker/deck.py index efc7e5f6..c6801105 100644 --- a/pluribus/poker/deck.py +++ b/pluribus/poker/deck.py @@ -61,3 +61,9 @@ def pick(self, random: bool = True) -> Card: card: Card = self._cards_in_deck.pop(index) self._dealt_cards.append(card) return card + + def remove(self, card): + """Remove a specific card from the deck""" + if card in self._cards_in_deck: + self._cards_in_deck.remove(card) + self._dealt_cards.append(card) diff --git a/research/blueprint_algo/blueprint_short_deck_poker.py b/research/blueprint_algo/blueprint_short_deck_poker.py index b2deeb34..ca3f89bb 100644 --- a/research/blueprint_algo/blueprint_short_deck_poker.py +++ b/research/blueprint_algo/blueprint_short_deck_poker.py @@ -204,7 +204,7 @@ def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float: logging.debug(f"Got EV for {a}: {voa[a]}") vo += sigma[I][a] * voa[a] logging.debug( - f"""Added to Node EV for ACTION: {a} INFOSET: {I} + f"""Added to Node EV for ACTION: {a} INFOSET: {I} STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}""" ) logging.debug(f"Updated EV at {I}: {vo}") @@ -346,16 +346,16 @@ def _create_dir() -> Path: @click.command() -@click.option("--strategy_interval", default=2, help=".") -@click.option("--n_iterations", default=10, help=".") -@click.option("--lcfr_threshold", default=80, help=".") -@click.option("--discount_interval", default=1000, help=".") -@click.option("--prune_threshold", default=4000, help=".") +@click.option("--strategy_interval", default=400, help=".") +@click.option("--n_iterations", default=5500, help=".") +@click.option("--lcfr_threshold", default=400, help=".") +@click.option("--discount_interval", default=400, help=".") +@click.option("--prune_threshold", default=400, help=".") @click.option("--c", default=-20000, help=".") @click.option("--n_players", default=3, help=".") -@click.option("--print_iteration", default=10, help=".") -@click.option("--dump_iteration", default=10, help=".") -@click.option("--update_threshold", default=0, help=".") +@click.option("--print_iteration", default=100, help=".") +@click.option("--dump_iteration", default=20, help=".") +@click.option("--update_threshold", default=400, help=".") def train( strategy_interval: int, n_iterations: int, diff --git a/research/rts/RT.py b/research/rts/RT.py new file mode 100644 index 00000000..457ef58d --- /dev/null +++ b/research/rts/RT.py @@ -0,0 +1,30 @@ +from typing import List +import joblib + +from RT_cfr import rts +from pluribus.poker.card import Card + + +if __name__ == "__main__": + # We can 
set public cards or not + public_cards = [Card("ace", "diamonds"), Card("king", "clubs"), + Card("jack", "spades"), Card("10", "hearts"), + Card("10", "spades")] + # Action sequence must be in old form (one list, includes skips) + action_sequence = ["raise", "raise", "raise", "call", "call", + "raise", "raise", "raise", "call", "call", + "raise", "raise", "raise", "call", "call", "call"] + agent_output, offline_strategy = rts( + 'test_strategy2/unnormalized_output/offline_strategy_1500.gz', + 'test_strategy2/strategy_1500.gz', public_cards, action_sequence, + 1400, 1, 1, 3, 1, 1, 20 + ) + save_path = "test_strategy2/unnormalized_output/" + last_regret = { + info_set: dict(strategy) + for info_set, strategy in agent_output.regret.items() + } + joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip") + joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip") + import ipdb; + ipdb.set_trace() diff --git a/research/rts/RT_cfr.py b/research/rts/RT_cfr.py new file mode 100644 index 00000000..1e1d596e --- /dev/null +++ b/research/rts/RT_cfr.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import collections +from typing import Dict, List +import joblib +from pathlib import Path + +from tqdm import trange +import numpy as np +import datetime +import yaml + +from pluribus import utils +from pluribus.games.short_deck.state import ShortDeckPokerState, new_game +from pluribus.games.short_deck.agent import Agent +from pluribus.poker.card import Card + + +def normalize_strategy(this_info_sets_regret: Dict[str, float]) -> Dict[str, float]: + """Calculate the strategy based on the current information sets regret.""" + actions = this_info_sets_regret.keys() + regret_sum = sum([max(regret, 0) for regret in this_info_sets_regret.values()]) + if regret_sum > 0: + strategy: Dict[str, float] = { + action: max(this_info_sets_regret[action], 0) / regret_sum + for action in actions + } + elif this_info_sets_regret == {}: + # Don't return strategy if no strategy was made + # during training + strategy: Dict[str, float] = {} + elif regret_sum == 0: + # Regret is negative, we learned something + default_probability = 1 / len(actions) + strategy: Dict[str, float] = {action: default_probability for action in actions} + return strategy + + +def calculate_strategy( + regret: Dict[str, Dict[str, float]], + I: str, + state: ShortDeckPokerState, +) -> Dict[str, Dict[str, float]]: + """ + Calculate strategy based on regret + """ + sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3)) + rsum = sum([max(x, 0) for x in regret[I].values()]) + for a in state.legal_actions: + if rsum > 0: + sigma[I][a] = max(regret[I][a], 0) / rsum + else: + sigma[I][a] = 1 / len(state.legal_actions) + return sigma + + +def _create_dir(folder_id: str) -> Path: + """Create and get a unique dir path to save to using a timestamp.""" + time = str(datetime.datetime.now()) + for char in ":- .": + time = time.replace(char, "_") + path: Path = Path(f"./{folder_id}_results_{time}") + path.mkdir(parents=True, exist_ok=True) + return path + + +def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float: + """ + CFR algo with the a temporary regret object for better strategy averaging + """ + ph = state.player_i + + player_not_in_hand = not state.players[i].is_active + if state.is_terminal or player_not_in_hand: + return state.payout[i] + + elif ph == i: + I = state.info_set + # Move regret over to temporary object and build off that + if agent.tmp_regret[I] == {}: + 
agent.tmp_regret[I] = agent.regret[I].copy() + sigma = calculate_strategy(agent.tmp_regret, I, state) + + vo = 0.0 + voa = {} + for a in state.legal_actions: + new_state: ShortDeckPokerState = state.apply_action(a) + voa[a] = cfr(agent, new_state, i, t) + vo += sigma[I][a] * voa[a] + + for a in state.legal_actions: + agent.tmp_regret[I][a] += voa[a] - vo + + return vo + else: + Iph = state.info_set + # Move regret over to a temporary object and build off that + if agent.tmp_regret[Iph] == {}: + agent.tmp_regret[Iph] = agent.regret[Iph].copy() + sigma = calculate_strategy(agent.tmp_regret, Iph, state) + + try: + a = np.random.choice( + list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()), + )[0] + except KeyError: + p = 1 / len(state.legal_actions) + probabilities = np.full(len(state.legal_actions), p) + a = np.random.choice(state.legal_actions, p=probabilities) + sigma[Iph] = {action: p for action in state.legal_actions} + except: + import ipdb; + ipdb.set_trace() + + new_state: ShortDeckPokerState = state.apply_action(a) + return cfr(agent, new_state, i, t) + + + def rts( + offline_strategy_path: str, + last_regret_path: str, + public_cards: list, + action_sequence: list, + n_iterations: int, + lcfr_threshold: int, + discount_interval: int, + n_players: int, + update_interval: int, + update_threshold: int, + dump_int: int, + ): + """RTS.""" + config: Dict[str, int] = {**locals()} + save_path: Path = _create_dir('RTS') + with open(save_path / "config.yaml", "w") as stream: + yaml.dump(config, stream) + # TODO: fix the seed + # utils.random.seed(36) + agent = Agent(regret_path=last_regret_path) + # Load the unnormalized strategy to build off + offline_strategy = joblib.load(offline_strategy_path) + state: ShortDeckPokerState = new_game( + 3, real_time_test=True, public_cards=public_cards + ) + # Load the current game state + current_game_state: ShortDeckPokerState = state.load_game_state( + offline_strategy, action_sequence + ) + for t in trange(1, n_iterations + 1, desc="train iter"): + for i in range(n_players): # fixed position i + # Deal hole cards based on bayesian updating of hole card probs + state: ShortDeckPokerState = current_game_state.deal_bayes() + cfr(agent, state, i, t) + if t < lcfr_threshold and t % discount_interval == 0: + d = (t / discount_interval) / ((t / discount_interval) + 1) + for I in agent.tmp_regret.keys(): + for a in agent.tmp_regret[I].keys(): + agent.tmp_regret[I][a] *= d + # Add the unnormalized strategy into the original + # Right now assumes n_iterations is a multiple of dump_int + if t % dump_int == 0: + # Add the regret back to the regret dict; we'll build off it in + # the next RTS pass + for I in agent.tmp_regret.keys(): + if agent.tmp_regret[I] != {}: + agent.regret[I] = agent.tmp_regret[I].copy() + for info_set, this_info_sets_regret in sorted(agent.tmp_regret.items()): + # If this_info_sets_regret == {}, we do nothing + strategy = normalize_strategy(this_info_sets_regret) + # Check if info_set exists.. 
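                # `offline_strategy` stays *unnormalized*: each dump adds the
                # per-info-set strategy derived from `tmp_regret` on top of what
                # has already accumulated, and consumers normalize on the fly
                # (e.g. `_calculate_strategy` in agent_test.py and
                # `_update_hole_cards_bayes`).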
+ no_info_set = info_set not in offline_strategy + if no_info_set or offline_strategy[info_set] == {}: + offline_strategy[info_set] = {a: 0 for a in strategy.keys()} + for action, probability in strategy.items(): + offline_strategy[info_set][action] += probability + agent.reset_new_regret() + + return agent, offline_strategy + + +if __name__ == "__main__": + # We can set public cards or not + public_cards = [Card("ace", "diamonds"), Card("king", "clubs"), + Card("jack", "spades"), Card("10", "hearts"), + Card("10", "spades")] + # Action sequence must be in old form (one list, includes skips) + action_sequence = ["raise", "raise", "raise", "call", "call", + "raise", "raise", "raise", "call", "call", + "raise", "raise", "raise", "call", "call", "call"] + agent_output, offline_strategy = rts( + 'test_strategy3/unnormalized_output/offline_strategy_1500.gz', + 'test_strategy3/strategy.gz', public_cards, action_sequence, + 1400, 1, 1, 3, 1, 1, 20 + ) + save_path = "test_strategy3/unnormalized_output/" + last_regret = { + info_set: dict(strategy) + for info_set, strategy in agent_output.regret.items() + } + joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip") + joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip") + import ipdb; + ipdb.set_trace() diff --git a/research/stat_tests/agent_test.py b/research/stat_tests/agent_test.py new file mode 100644 index 00000000..6569ac6e --- /dev/null +++ b/research/stat_tests/agent_test.py @@ -0,0 +1,162 @@ +from typing import List, Dict, DefaultDict +from pathlib import Path +import joblib +import collections + +import click +from tqdm import trange +import yaml +import datetime +import numpy as np +from scipy import stats + +from pluribus.games.short_deck.state import ShortDeckPokerState, new_game +from pluribus.poker.card import Card + + +def _calculate_strategy( + state: ShortDeckPokerState, + I: str, + strategy: DefaultDict[str, DefaultDict[str, float]], + count=None, + total_count=None +) -> str: + sigma = collections.defaultdict( + lambda: collections.defaultdict(lambda: 1 / 3) + ) + try: + # If strategy is empty, go to other block + sigma[I] = strategy[I].copy() + if sigma[I] == {}: + raise KeyError + norm = sum(sigma[I].values()) + for a in sigma[I].keys(): + sigma[I][a] /= norm + a = np.random.choice( + list(sigma[I].keys()), 1, p=list(sigma[I].values()), + )[0] + except KeyError: + if count is not None: + count += 1 + p = 1 / len(state.legal_actions) + probabilities = np.full(len(state.legal_actions), p) + a = np.random.choice(state.legal_actions, p=probabilities) + sigma[I] = {action: p for action in state.legal_actions} + if total_count is not None: + total_count += 1 + return a, count, total_count + + +def _create_dir(folder_id: str) -> Path: + """Create and get a unique dir path to save to using a timestamp.""" + time = str(datetime.datetime.now()) + for char in ":- .": + time = time.replace(char, "_") + path: Path = Path(f"./{folder_id}_results_{time}") + path.mkdir(parents=True, exist_ok=True) + return path + + +def agent_test( + hero_strategy_path: str, + opponent_strategy_path: str, + real_time_est: bool = False, + action_sequence: List[str] = None, + public_cards: List[Card] = [], + n_outer_iters: int = 30, + n_inner_iters: int = 100, + n_players: int = 3, + hero_count=None, + hero_total_count=None, +): + config: Dict[str, int] = {**locals()} + save_path: Path = _create_dir('bt') + with open(save_path / "config.yaml", "w") as steam: + yaml.dump(config, steam) + + # Load unnormalized strategy for 
hero + hero_strategy = joblib.load(hero_strategy_path) + # Load unnormalized strategy for opponents + opponent_strategy = joblib.load(opponent_strategy_path) + + # Loading game state we used RTS on + if real_time_est: + state: ShortDeckPokerState = new_game( + n_players, real_time_test=real_time_est, public_cards=public_cards + ) + current_game_state: ShortDeckPokerState = state.load_game_state( + opponent_strategy, action_sequence + ) + + # TODO: Right now, this can only be used for loading states if the two + # strategies are averaged. Even averaging strategies is risky. Loading a + # game state should be used with caution. It will work only if the + # probability of reach is identical across strategies. Use the average + # strategy. + + info_set_lut = {} + EVs = np.array([]) + for _ in trange(1, n_outer_iters): + EV = np.array([]) # Expected value for player 0 (hero) + for t in trange(1, n_inner_iters + 1, desc="train iter"): + for p_i in range(n_players): + if real_time_est: + # Deal hole cards based on bayesian updating of hole card + # probabilities + state: ShortDeckPokerState = current_game_state.deal_bayes() + else: + state: ShortDeckPokerState = new_game( + n_players, + info_set_lut + ) + info_set_lut = state.info_set_lut + while True: + player_not_in_hand = not state.players[p_i].is_active + if state.is_terminal or player_not_in_hand: + EV = np.append(EV, state.payout[p_i]) + break + if state.player_i == p_i: + random_action, hero_count, hero_total_count = \ + _calculate_strategy( + state, + state.info_set, + hero_strategy, + count=hero_count, + total_count=hero_total_count + ) + else: + random_action, oc, otc = _calculate_strategy( + state, + state.info_set, + opponent_strategy, + ) + state = state.apply_action(random_action) + EVs = np.append(EVs, EV.mean()) + t_stat = (EVs.mean() - 0) / (EVs.std() / np.sqrt(n_outer_iters)) + p_val = stats.t.sf(np.abs(t_stat), n_outer_iters - 1) + results_dict = { + 'Expected Value': float(EVs.mean()), + 'T Statistic': float(t_stat), + 'P Value': float(p_val), + 'Standard Deviation': float(EVs.std()), + 'N': int(len(EVs)), + 'Random Moves Hero': hero_count, + 'Total Moves Hero': hero_total_count + } + with open(save_path / 'results.yaml', "w") as stream: + yaml.safe_dump(results_dict, stream=stream, default_flow_style=False) + + +if __name__ == "__main__": + strat_path = "test_strategy2/unnormalized_output/" + agent_test( + hero_strategy_path=strat_path + "random_strategy.gz", + opponent_strategy_path=strat_path + "offline_strategy_1500.gz", + real_time_est=False, + public_cards=[], + action_sequence=None, + n_inner_iters=25, + n_outer_iters=75, + hero_count=0, + hero_total_count=0 + ) diff --git a/research/stat_tests/average_unnormalized_strategy.py b/research/stat_tests/average_unnormalized_strategy.py new file mode 100644 index 00000000..e35965ad --- /dev/null +++ b/research/stat_tests/average_unnormalized_strategy.py @@ -0,0 +1,91 @@ +import collections +import glob +import os +import re +from typing import Dict, List, Union + +import click +import joblib +from tqdm import tqdm + + +def calculate_strategy(this_info_sets_regret: Dict[str, float]) -> Dict[str, float]: + """Calculate the strategy based on the current information sets regret.""" + actions = this_info_sets_regret.keys() + regret_sum = sum([max(regret, 0) for regret in this_info_sets_regret.values()]) + if regret_sum > 0: + strategy: Dict[str, float] = { + action: max(this_info_sets_regret[action], 0) / regret_sum + for action in actions + } + elif this_info_sets_regret == {}: + 
# Don't return strategy if no strategy was made + # during training + strategy: Dict[str, float] = {} + elif regret_sum == 0: + # Regret is negative, we learned something + default_probability = 1 / len(actions) + strategy: Dict[str, float] = {action: default_probability for action in actions} + return strategy + + +def try_to_int(text: str) -> Union[str, int]: + """Attempt to return int.""" + return int(text) if text.isdigit() else text + + +def natural_key(text): + """Sort with natural numbers.""" + return [try_to_int(c) for c in re.split(r"(\d+)", text)] + + +def average_strategy(all_file_paths: List[str]) -> Dict[str, Dict[str, float]]: + """Compute the mean strategy over all timesteps.""" + # The offline strategy for all information sets. + offline_strategy: Dict[str, Dict[str, float]] = collections.defaultdict( + lambda: collections.defaultdict(lambda: 0.0) + ) + # Sum up all strategies. + for dump_path in tqdm(all_file_paths, desc="loading dumps"): + # Load file. + try: + agent = joblib.load(dump_path) + except Exception as e: + tqdm.write(f"Failed to load file at {dump_path} because:{e}") + agent = {} + regret = agent.get("regret", {}) + # Sum probabilities from computed strategy.. + for info_set, this_info_sets_regret in sorted(regret.items()): + strategy = calculate_strategy(this_info_sets_regret) + # If strategy == {}, we do nothing + for action, probability in strategy.items(): + offline_strategy[info_set][action] += probability + # Return regular dict, not defaultdict. + return {info_set: dict(strategy) for info_set, strategy in offline_strategy.items()} + + +@click.command() +@click.option( + "--results_dir_path", default=".", help="the location of the agent file dumps." +) +@click.option( + "--write_dir_path", default=".", help="where to save the offline strategy" +) +def cli(results_dir_path: str, write_dir_path: str): + """Compute the strategy and write to file.""" + # Find all files to load. + all_file_paths = glob.glob(os.path.join(results_dir_path, "*.gz")) + if not all_file_paths: + raise ValueError(f"No agent dumps could be found at: {results_dir_path}") + # Sort the file paths in the order they were created. + all_file_paths = sorted(all_file_paths, key=natural_key) + offline_strategy = average_strategy(all_file_paths) + # Save dictionary to compressed file. 
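    # Illustration (hypothetical file name): a last dump called "agent_1500.gz"
    # gives latest_iteration == 1500, so the averaged strategy is written to
    # "offline_strategy_1500.gz".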
+ latest_file = os.path.basename(all_file_paths[-1]) + latest_iteration: int = int(re.findall(r"\d+", latest_file)[0]) + save_file: str = f"offline_strategy_{latest_iteration}.gz" + joblib.dump(offline_strategy, os.path.join(write_dir_path, save_file)) + + +if __name__ == "__main__": + cli() diff --git a/research/stat_tests/rts_ab_test.py b/research/stat_tests/rts_ab_test.py new file mode 100644 index 00000000..047d571d --- /dev/null +++ b/research/stat_tests/rts_ab_test.py @@ -0,0 +1,112 @@ +import numpy as np +import json +import joblib +import sys +from typing import List + +import click + +from agent_test import agent_test +from pluribus.poker.deck import Deck +sys.path.append('research/rts') +from RT_cfr import rts + + +@click.command() +@click.option("--offline_strategy_path", help=".") +@click.option("--last_regret_path", help=".") +@click.option("--n_iterations", default=1500, help=".") +@click.option("--lcfr_threshold", default=400, help=".") +@click.option("--discount_interval", default=400, help=".") +@click.option("--n_players", default=3, help=".") +@click.option("--update_interval", default=400, help=".") +@click.option("--update_threshold", default=400, help=".") +@click.option("--dump_int", default=20, help=".") +@click.option("--save_dir", help=".") +@click.option("--n_inner_iters", default=25, help=".") +@click.option("--n_outer_iters", default=150, help=".") +def rts_ab_test( + offline_strategy_path: str, + last_regret_path: str, + n_iterations: int, + lcfr_threshold: int, + discount_interval: int, + n_players: int, + update_interval: int, + update_threshold: int, + dump_int: int, + save_dir: str, + n_inner_iters: int, + n_outer_iters: int, + ranks: List[int] = list(range(10, 14 + 1)), +): + check = joblib.load(offline_strategy_path) + histories = np.random.choice(list(check.keys()), 2) + action_sequences = [] + public_cards_lst = [] + community_card_dict = { + "pre_flop": 0, + "flop": 3, + "turn": 4, + "river": 5, + } + deck = Deck(include_ranks=ranks) + for history in histories: + history_dict = json.loads(history) + history_lst = history_dict['history'] + action_sequence = [] + betting_rounds = [] + for x in history_lst: + action_sequence += list(x.values())[0] + betting_rounds += list(x.keys()) + action_sequences.append(action_sequence) + if action_sequences: + final_betting_round = list(betting_rounds)[-1] + else: + final_betting_round = "pre_flop" + n_cards = community_card_dict[final_betting_round] + cards_in_deck = deck._cards_in_deck + public_cards = list( + np.random.choice(cards_in_deck, n_cards) + ) + public_cards_lst.append(public_cards) + + for i in range(0, len(action_sequences)): + public_cards = public_cards_lst[i].copy() + action_sequence = action_sequences[i].copy() + agent_output, offline_strategy = rts( + offline_strategy_path, + last_regret_path, + public_cards, + action_sequence, + n_iterations=n_iterations, + lcfr_threshold=lcfr_threshold, + discount_interval=discount_interval, + n_players=n_players, + update_interval=update_interval, + update_threshold=update_threshold, + dump_int=dump_int + ) + last_regret = { + info_set: dict(strategy) + for info_set, strategy in agent_output.regret.items() + } + joblib.dump(offline_strategy, save_dir + f'rts_output{i}.gz', compress="gzip") + joblib.dump(last_regret, save_dir + f'last_regret{i}.gz', compress="gzip") + + public_cards = public_cards_lst[i].copy() + action_sequence = action_sequences[i].copy() + agent_test( + hero_strategy_path=save_dir + f"rts_output{i}.gz", + 
opponent_strategy_path=offline_strategy_path, + real_time_est=True, + public_cards=public_cards, + action_sequence=action_sequence, + n_inner_iters=n_inner_iters, + n_outer_iters=n_outer_iters, + hero_count=0, + hero_total_count=0, + ) + +if __name__ == "__main__": + rts_ab_test() diff --git a/research/test_methodology/validating_nash_equilibriums_via_simulations.ipynb b/research/test_methodology/validating_nash_equilibriums_via_simulations.ipynb deleted file mode 100644 index c146c559..00000000 --- a/research/test_methodology/validating_nash_equilibriums_via_simulations.ipynb +++ /dev/null @@ -1,224 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Validating Nash Equilibriums Via Simulations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_by Colin Manko_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In an effort to validate and test possible improvements to core poker artificial intelligence algorithms, I have designed the following methodology." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Goals" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Validate that MCCFR offline learning strategy is approximating a Nash equilibrium\n", - "- More generally, create a methodology that allows for rigorously testing changes made to the core AI algorithms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Prerequisites" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Need two test bot implementation strategies ($\\beta{1}$ and $\\beta{2}$) that we would like to compare\n", - "- Need a human tester ($H_{0}$) as a quasi control. The human tester should not have access to any underlying strategies from the test bots or simulated Nash" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Step 1: Randomly Generate Test Game" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Given a set of $N$ game tree nodes (this is the entire game tree, as given by infoset, $I$), randomly generate $x$ test nodes without preplacement and with equal probability. Call the set of test nodes $U$.\n", - "\n", - "As a side note, we will account for probability of reach ($p(h)$) in another step. Equal probability across nodes allows us to find patterns across nodes where our agent underperforms. We will adjust the expected value at $I$, ($v^{\\sigma}(I)$), by $p(h)$.\n", - "- **How to**: For Limit Texas Hold'em, the number of action sequences ($N$), is small enough that they can be found computationally rather than analytically. We can run *all_action_sequences.py* in the *size_of_problem* directory to generate this list. \n", - "- _Something like 15-20 hours and less than 4GB??_\n", - "- Generate $x$ integers to be indices and select them from the *all_action_sequences.py* output\n", - "- Once $x$ action sequences are generated, randomly generate $x$ public card combos, based on the betting stage of the test node, $u$, as well as one pair of private hole cards to be used by $\\beta{1}$, $\\beta{2}$ and $H_0$. They will only get that hand at $u$. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Step 2: Prepare Realtime Search for Finding Nash Equilibrium" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For each test node, $u$, in $U$, use realtime search to compute the Nash Equilibrium ($\\sigma^*$) by constraining the search algorithm to start at $u$, where $u$ is equivalent to $I$ in regard to action sequence, but does not have any set hand for the traversing player ($p_i$).\n", - "\n", - "Use a pooled strategy between $\\beta{1}$ and $\\beta{2}$ to estimate $p(h)$ without bias:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For hand in possible combinations of real hands:\n", - "
\n", - "    For idx, $a$ in action sequence at $u$:\n", - "
\n", - "        if idx == 0:\n", - "
\n", - "             $p(h)_\\beta{1}$ = $\\beta{1}$[$I$][$a$]\n", - "
\n", - "             $p(h)_\\beta{2}$ = $\\beta{2}$[$I$][$a$]\n", - "
\n", - "        $p(h)_\\beta{1}$ *= $\\beta{1}$[$I$][$a$]\n", - "
\n", - "        $p(h)_\\beta{2}$ *= $\\beta{1}$[$I$][$a$]\n", - "
\n", - "    p(h)[rs] = ($p(h)_\\beta{1}$ + $p(h)_\\beta{2}$)/2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The root node of the realtime search algorithm is replaced with a chance node that represents each possible node in the public state $G$ [[Brown, Sandholm, Amos]](https://papers.nips.cc/paper/7993-depth-limited-solving-for-imperfect-information-games.pdf). From the above psuedo-code, this deal can be generated as: " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate deal order\n", - "
\n", - "For $i$ in $P_i$ deal order:\n", - "
\n", - "    Generate hand for player based on normalized $p(h)[rs]$ if available, else try again" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_The \"if available, else try again\" part could be made better_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Other important features of the _\"Nash Bot\"_ real time search..**:\n", - "- The _\"Nash bot\"_ is the master of this node. In order to reach full convergence, from the normal MCCFR algorithm, we must remove the sampling of actions for opponents.\n", - "- For ease, the real time search should not use leaf nodes, but should search to the end of the game tree, where either a terminal node or a shown down is entered. In this way, we can get a truer sense of the expected value." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Nash Equilibrium is found if the change in strategy on each iteration drops below some threshold $t$ for the real hand we are testing for. Charting probabilities for each action in $u$ over time for the randomly generated real hand to test should show a convergence over time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_One main benefit of using this real time search to validate CFR is this search will need to be developed anyway._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Step 3: Test and Measure Success" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**For the test bots:**\n", - "For each $u$ in $U$, play each test strategy ($\\beta{1}$ and $\\beta{2}$) against the _\"Nash bot\"_ for $r$ number of simulations. The _\"Nash bot\"_ should be dealt available hands from the distribution of probabilities as determined by $p(h)[rs]$ in the pseudo-code above. Both the test bots and the human tester will be dealt the same hand in each simulation of game play on $u$, as randomly generated in step 1. \n", - "\n", - "If $\\beta{1}$ or $\\beta{2}$ has converged to a Nash equilibrium, then we should expect $v^\\sigma$ to be equal to 0 for our test bot, assuming that _\"Nash bot\"_ has converged to a Nash equilibrium itself. $v^{\\sigma^*}(u)$ and $v^{\\sigma}(u)$ are the estimated payouts for the _\"Nash bot\"_ opponents and the \"hero\" (test bots or human), respectively.\n", - "\n", - "**For the human tester:**\n", - "We can simply create a contrived game. Based on the normalized probability of reach for $u$, $\\bar{p(h)}$, we can randomly generate which $u$ the human player is entered into, however they will always have the same hand upon entering $u$ and their opponents hands will vary based on $p(h)[rs]$.\n", - "\n", - "The test metric is as follows, after $p(h)$ has been normalized for space $U$, $\\bar{p(h)}$:\n", - "$$\\sum_{i=1}^{x}(v^{\\sigma}(u_i)-v^{\\sigma^*}(u_{-i}))\\times{\\bar{p(h)}}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The value closest to 0 (summing no testing agent goes over 0) will have best approximated the Nash equilibrium. Additionaly, $H_0$ can be used as a quasi-control, to validate that the bot is beating a human.\n", - "\n", - "The above metric also has some degree of simulation error. For each simulation in $r$ simulations, we create a distribution of values that has a standard deviation and follows the normal distribution. 
\n", - "\n", - "Along with calculating the expected payout per simulation, $u^{\\sigma}(u_i)-u^{\\sigma^*}(u_{-i})$, we can also calculate $\\sigma$ for this distribution in order to describe a confidence interval around the test metric. \n", - "\n", - "Finally, a simple difference of means can be done between each test bot to decipher a winner and if that winner had a statistically significant edge. We can then study each $u$ in $U$ to find patterns in which nodes the espspective bots did not do well with." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/research/to_do.md b/research/to_do.md deleted file mode 100644 index e510af90..00000000 --- a/research/to_do.md +++ /dev/null @@ -1,49 +0,0 @@ -A Place for Next Steps in Short Deck Implementation - -## Abstraction - -#### Information Abstraction -- hard code opening hand clusters -- decide how to store these for lookup in blueprint/real time algo -- run for short deck - -#### Action Abstraction -- not sure how this fits into blueprint/real time yet - -## Blueprint Algo -- apply to contrived short deck game - -## Real Time Search Algo -- need isomorphic/lossless handling of cards?? # Non-essential maybe.. -- mock up "toy" version - - pre-req: stateful version of short deck - -### Rules of Contrived Short Deck Game -- 3 players -- 2-9 removed -- no adjustments to hand rankings versus no-limit -- 10000 in stack, 50 small blind, 100 big blind -- limited betting - -#### Possible Next Steps -- fix short deck game and roll out to online hosting? -- go right on to full game? - -#### Current (Concise) Papers -- Abstraction - - https://www.cs.cmu.edu/~sandholm/hierarchical.aamas15.pdf <- this algo - - http://www.ifaamas.org/Proceedings/aamas2013/docs/p271.pdf <- these features -- Blueprint - - https://science.sciencemag.org/content/sci/suppl/2019/07/10/science.aay2400.DC1/aay2400-Brown-SM.pdf <- pseudo code -- Real Time Algo - - https://papers.nips.cc/paper/7993-depth-limited-solving-for-imperfect-information-games.pdf <- build off this - - make theses changes: - - [optimized vector-based linear cfr?](https://arxiv.org/pdf/1809.04040.pdf) - - [only samples chance events?](http://martin.zinkevich.org/publications/ijcai2011_rgbr.pdf) - -#### TODO: Colin -- Generate abstraction for 20 cards --- Program to turn that into dictionary and store separately -- Hard code preflop lossless -- Write next steps in docstring of blueprint algo -- Consider getting rid of notebooks before merging into develop.. 
\ No newline at end of file diff --git a/research/size_of_problem/action_sequences.pkl b/test/data/action_sequences.pkl similarity index 100% rename from research/size_of_problem/action_sequences.pkl rename to test/data/action_sequences.pkl diff --git a/test/data/random_action_sequences.pkl b/test/data/random_action_sequences.pkl new file mode 100644 index 00000000..41ab5a88 Binary files /dev/null and b/test/data/random_action_sequences.pkl differ diff --git a/test/data/random_offline_strategy.gz b/test/data/random_offline_strategy.gz new file mode 100644 index 00000000..ff4cfe7f Binary files /dev/null and b/test/data/random_offline_strategy.gz differ diff --git a/test/functional/test_short_deck.py b/test/functional/test_short_deck.py index 889bc437..276082ba 100644 --- a/test/functional/test_short_deck.py +++ b/test/functional/test_short_deck.py @@ -1,17 +1,21 @@ import collections +import json import copy import random from typing import List, Tuple, Optional +import joblib import pytest import numpy as np import dill as pickle -from pluribus.games.short_deck.state import ShortDeckPokerState +from pluribus.games.short_deck.state import ShortDeckPokerState, new_game, \ + InfoSetLookupTable from pluribus.games.short_deck.player import ShortDeckPokerPlayer from pluribus.poker.card import Card from pluribus.poker.pot import Pot from pluribus.utils.random import seed +from pluribus.poker.deck import Deck def _new_game( @@ -35,10 +39,10 @@ def _new_game( return state, pot -def _load_action_sequences(directory): +def _load_pkl_file(directory): with open(directory, "rb") as file: - action_sequences = pickle.load(file) - return action_sequences + pkl_file = pickle.load(file) + return pkl_file def test_short_deck_1(): @@ -203,18 +207,17 @@ def _get_flop(state: ShortDeckPokerState) -> List[Card]: @pytest.mark.parametrize("n_players", [2, 3]) -def test_call_action_sequence(n_players): +def test_call_action_sequence(n_players, n: int = 50): """ - Make sure we never see an action sequence of "raise", "call", "call" in the same - round with only two players. There would be a similar analog for more than two players, - but this should aid in initially finding the bug. + Make sure we never see an action sequence of "raise", "call", "call" when + down to two players """ # Seed the random number generation so things are procedural/reproducable. seed(42) - # example of a bad sequence in a two-handed game in one round + # Example of a bad sequence in a two-handed game in one round bad_seq = ["raise", "call", "call"] # Run some number of random iterations. - for _ in range(200): + for _ in range(n): state, _ = _new_game(n_players=n_players, small_blind=50, big_blind=100) betting_round_dict = collections.defaultdict(list) while state.betting_stage not in {"show_down", "terminal"}: @@ -231,22 +234,22 @@ def test_call_action_sequence(n_players): # Loop through the action history and make sure the bad # sequence has not happened. for i in range(len(no_fold_action_history)): - history_slice = no_fold_action_history[i : i + len(bad_seq)] + history_slice = no_fold_action_history[i: i + len(bad_seq)] assert history_slice != bad_seq state = state.apply_action(random_action) @pytest.mark.parametrize("n_players", [2, 3]) -def test_action_sequence(n_players: int): - """ - Check each round against validated action sequences to ensure the state class is - working correctly. 
- """ +def test_action_sequence( + n_players: int, + n: int = 50, + action_sequences_path: str = "test/data/action_sequences.pkl" +): + """Ensure action sequences are legal.. """ # Seed the random number generation so things are procedural/reproducable. seed(42) - directory = "research/size_of_problem/action_sequences.pkl" - action_sequences = _load_action_sequences(directory) - for i in range(200): + action_sequences = _load_pkl_file(action_sequences_path) + for i in range(n): state, _ = _new_game(n_players=n_players, small_blind=50, big_blind=100) betting_stage_dict = { @@ -281,14 +284,14 @@ def test_action_sequence(n_players: int): assert action_sequence in possible_sequences -def test_skips(n_players: int = 3): +def test_skips(n_players: int = 3, n: int = 50): """ - Check each round to make sure that skips are mod number of players and appended on - the skipped player's turn + Check each round to make sure that skips are mod number of players and + appended on the skipped player's turn """ # Seed the random number generation so things are procedural/reproducable. seed(42) - for _ in range(500): + for _ in range(n): state, _ = _new_game(n_players=n_players, small_blind=50, big_blind=100) while True: @@ -338,3 +341,137 @@ def test_skips(n_players: int = 3): for i, action in enumerate(actions[fold_idx:]): if i % n_players == 0: assert action == "skip" + + +def test_load_game_state( + n_players: int = 3, + n: int = 5, + random_actions_path: str = "test/data/random_action_sequences.pkl" +): + # Load a random sample of action sequences + action_sequences = _load_pkl_file(random_actions_path) + test_action_sequences = np.random.choice(action_sequences, n) + # Lookup table that defaults to 0 as the cluster id + # TODO: Not sure how to quiet the mypy typing complaint.. 
+ info_set_lut: InfoSetLookupTable = { + "pre_flop": collections.defaultdict(lambda: 0), + "flop": collections.defaultdict(lambda: 0), + "turn": collections.defaultdict(lambda: 0), + "river": collections.defaultdict(lambda: 0), + } + state: ShortDeckPokerState = new_game( + n_players, + info_set_lut=info_set_lut, + real_time_test=True, + public_cards=[] + ) + for action_sequence in test_action_sequences: + game_action_sequence = action_sequence.copy() + # Load current game state + current_game_state: ShortDeckPokerState = state.load_game_state( + offline_strategy={}, action_sequence=game_action_sequence + ) + current_history = current_game_state._history + check_action_seq_current = [] + for betting_stage in current_history.keys(): + check_action_seq_current += current_history[betting_stage] + check_action_sequence = [a for a in check_action_seq_current if a != "skip"] + assert check_action_sequence == action_sequence[:-1] + + new_state = current_game_state.deal_bayes() + full_history = new_state._history + check_action_seq_full = [] + for betting_stage in full_history.keys(): + check_action_seq_full += full_history[betting_stage] + check_action_sequence = [a for a in check_action_seq_full if a != "skip"] + assert check_action_sequence == action_sequence + + +def test_public_cards( + n_players: int = 3, + n: int = 5, + strategy_path: str = "test/data/random_offline_strategy.gz" +): + strategy = joblib.load(strategy_path) + histories = np.random.choice(list(strategy.keys()), n) + action_sequences = [] + public_cards_lst = [] + final_betting_round_lst: List[str] = [] + community_card_dict = { + "pre_flop": 0, + "flop": 3, + "turn": 4, + "river": 5, + } + ranks = list(range(10, 14 + 1)) + deck = Deck(include_ranks=ranks) + for history in histories: + history_dict = json.loads(history) + history_lst = history_dict["history"] + action_sequence = [] + betting_rounds = [] + for x in history_lst: + action_sequence += list(x.values())[0] + betting_rounds += list(x.keys()) + if not action_sequence: + continue + action_sequences.append(action_sequence) + final_betting_round = list(betting_rounds)[-1] + final_betting_round_lst.append(final_betting_round) + n_cards = community_card_dict[final_betting_round] + cards_in_deck = deck._cards_in_deck + public_cards = list( + np.random.choice(cards_in_deck, n_cards, replace=False) + ) + public_cards_lst.append(public_cards) + + # TODO: Not sure how to quiet mypy here for typing complaint.. 
+    info_set_lut: InfoSetLookupTable = {
+        "pre_flop": collections.defaultdict(lambda: 0),
+        "flop": collections.defaultdict(lambda: 0),
+        "turn": collections.defaultdict(lambda: 0),
+        "river": collections.defaultdict(lambda: 0),
+    }
+    for i in range(0, len(action_sequences)):
+        public_cards = public_cards_lst[i].copy()
+        final_betting_round = final_betting_round_lst[i]
+        if not public_cards and final_betting_round == "pre_flop":
+            continue
+        action_sequence = action_sequences[i].copy()
+        state: ShortDeckPokerState = new_game(
+            n_players,
+            info_set_lut=info_set_lut,
+            real_time_test=True,
+            public_cards=public_cards,
+        )
+        current_game_state: ShortDeckPokerState = state.load_game_state(
+            offline_strategy={}, action_sequence=action_sequence
+        )
+        new_state = current_game_state.deal_bayes()
+
+        cont = True
+        if len(public_cards) == 0:
+            loaded_betting_stage = "pre_flop"
+        elif len(public_cards) == 3:
+            loaded_betting_stage = "flop"
+        elif len(public_cards) == 4:
+            loaded_betting_stage = "turn"
+        elif len(public_cards) == 5:
+            loaded_betting_stage = "river"
+
+        public_info = new_state._public_information
+        for betting_stage in public_info.keys():
+            if betting_stage == "pre_flop":
+                # No public cards are dealt in the pre_flop stage.
+                continue
+            if cont:
+                card_len = community_card_dict[betting_stage]
+                assert public_cards[:card_len] == public_info[betting_stage]
+                if betting_stage == loaded_betting_stage:
+                    cont = False
+            else:
+                # We should only get here when the action sequence ended a
+                # betting round, so exactly one extra card has been dealt.
+                state_public_card_len = len(new_state.community_cards)
+                public_card_len = len(public_cards)
+                assert state_public_card_len == public_card_len + 1
diff --git a/test/regression/check_bayes.py b/test/regression/check_bayes.py
new file mode 100644
index 00000000..d42faf5f
--- /dev/null
+++ b/test/regression/check_bayes.py
@@ -0,0 +1,157 @@
+import joblib
+import collections
+import json
+from typing import DefaultDict
+
+import numpy as np
+from tqdm import trange
+
+from pluribus.poker.deck import Deck
+from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
+
+
+def _calculate_strategy(
+    state: ShortDeckPokerState,
+    I: str,
+    strategy: DefaultDict[str, DefaultDict[str, float]],
+) -> str:
+    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
+    try:
+        # If the strategy at this infoset is empty, fall back to uniform.
+        sigma[I] = strategy[I].copy()
+        if sigma[I] == {}:
+            raise KeyError
+        norm = sum(sigma[I].values())
+        for a in sigma[I].keys():
+            sigma[I][a] /= norm
+        a = np.random.choice(
+            list(sigma[I].keys()), 1, p=list(sigma[I].values()),
+        )[0]
+    except KeyError:
+        p = 1 / len(state.legal_actions)
+        probabilities = np.full(len(state.legal_actions), p)
+        a = np.random.choice(state.legal_actions, p=probabilities)
+        sigma[I] = {action: p for action in state.legal_actions}
+    return a
+
+
+n = 10000
+n_players = 3
+inner_iters = 1000
+
+strategy_dir = "research/test_methodology/test_strategy2/"
+strategy_path = "unnormalized_output/offline_strategy_1500.gz"
+check = joblib.load(strategy_dir + strategy_path)
+histories = np.random.choice(list(check.keys()), n)
+action_sequences = []
+public_cards_lst = []
+community_card_dict = {
+    "pre_flop": 0,
+    "flop": 3,
+    "turn": 4,
+    "river": 5,
+}
+# Use a shorter deck to keep the simulation time reasonable.
+ranks = list(range(12, 14 + 1))
+deck = Deck(include_ranks=ranks)
+found = 0
+for idx, history in enumerate(histories):
+    if idx % 100 == 0:
+        print(idx)
+    history_dict = json.loads(history)
+    history_lst = history_dict["history"]
+    if history_lst == []:
+        continue
+    action_sequence = []
+    betting_rounds = []
+    for x in history_lst:
+        action_sequence += list(x.values())[0]
+        betting_rounds += list(x.keys())
+    if not betting_rounds:
+        # Should not happen, since empty histories are skipped above.
+        continue
+    final_betting_round = betting_rounds[-1]
+    # Hacking this for now, keeping the simulation small.
+    if len(action_sequence) > 2:
+        continue
+    action_sequences.append(action_sequence)
+    n_cards = community_card_dict[final_betting_round]
+    cards_in_deck = deck._cards_in_deck
+    public_cards = np.random.choice(cards_in_deck, n_cards, replace=False)
+    public_cards_lst.append(list(public_cards))
+    found += 1
+    if found == 2:
+        break
+    # Assuming we find 2 action sequences out of 1000.
+
+store_hand_probs = {}
+for i in trange(0, len(action_sequences)):
+    public_cards = public_cards_lst[i].copy()
+    # TODO: There may be a bug when public_cards is empty; check this later.
+    action_sequence = action_sequences[i].copy()
+    state: ShortDeckPokerState = new_game(
+        n_players,
+        real_time_test=True,
+        public_cards=public_cards,
+    )
+    current_game_state: ShortDeckPokerState = state.load_game_state(
+        offline_strategy=check, action_sequence=action_sequence
+    )
+    new_state = current_game_state.deal_bayes()
+
+    this_hand_probs = new_state._starting_hand_probs.copy()
+    for p_i in this_hand_probs.keys():
+        for starting_hand in this_hand_probs[p_i].keys():
+            x = this_hand_probs[p_i][starting_hand]
+            this_hand_probs[p_i][starting_hand] = {'deal_bayes': x, 'sim': None}
+
+    action_sequence = action_sequences[i].copy()
+    public_cards = public_cards_lst[i].copy()
+    info_set_lut = {}
+    cont = True
+    tries = 0
+    success = 0
+    hand_dict = {0: {}, 1: {}, 2: {}}
+    while cont:
+        # Rejection sampling: deal a fresh game and replay the offline
+        # strategy until it reproduces the target action sequence.
+        actions = []
+        count = 0
+        state: ShortDeckPokerState = new_game(
+            n_players,
+            info_set_lut,
+            real_time_test=True,
+            public_cards=public_cards
+        )
+        info_set_lut = state.info_set_lut
+        while True:
+            if tries == 1000:  # Definitely a hack; be careful about this value.
+                # Seed the dealt hole cards with a zero count so they show up
+                # in the tallies even if this attempt fails.
+                for p_i, player in enumerate(state.players):
+                    hole_cards = tuple(player.cards)
+                    hand_dict[p_i].setdefault(hole_cards, 0)
+            random_action = _calculate_strategy(state, state.info_set, check)
+            if random_action != action_sequence[count]:
+                tries += 1
+                break
+            state = state.apply_action(random_action)
+            actions.append(random_action)
+            if actions == action_sequence:
+                # The sampled play-out matches the target sequence, so tally
+                # the hole cards that were dealt to each player.
+                for p_i, player in enumerate(state.players):
+                    hole_cards = tuple(player.cards)
+                    hand_dict[p_i][hole_cards] = hand_dict[p_i].get(hole_cards, 0) + 1
+                success += 1
+                break
+            count += 1
+        if success == 1:
+            break
+    # Drop into the debugger to compare hand_dict with this_hand_probs.
+    import ipdb
+    ipdb.set_trace()
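The check_bayes.py script above stops at an ipdb trace so the simulated counts in hand_dict can be compared with the deal_bayes probabilities by hand; the 'sim' slots in this_hand_probs and the store_hand_probs dict are left unfilled. Below is a minimal sketch of how that comparison could be finished, assuming the starting hands in _starting_hand_probs are keyed by the same hole-card tuples the rejection sampler records; the helper name summarise_simulation is hypothetical and not part of the diff.

from typing import Any, Dict, Tuple


def summarise_simulation(
    hand_dict: Dict[int, Dict[Tuple[Any, ...], int]],
    this_hand_probs: Dict[int, Dict[Any, Dict[str, Any]]],
) -> Dict[int, Dict[Any, Dict[str, Any]]]:
    """Fill the 'sim' slots with normalised frequencies from the sampler.

    Hypothetical helper: assumes this_hand_probs is keyed per player by the
    same hole-card tuples that the rejection sampler stores in hand_dict.
    """
    for p_i, counts in hand_dict.items():
        total = sum(counts.values())
        for hole_cards, count in counts.items():
            # Empirical probability of this starting hand given the action
            # sequence; 0.0 if no successful play-outs were recorded.
            freq = count / total if total else 0.0
            if hole_cards in this_hand_probs.get(p_i, {}):
                this_hand_probs[p_i][hole_cards]["sim"] = freq
    return this_hand_probs

With a helper like this, the trailing ipdb.set_trace() could instead record store_hand_probs[i] = summarise_simulation(hand_dict, this_hand_probs), and the 'deal_bayes' and 'sim' entries could be inspected or asserted against each other once enough successful play-outs have been collected.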