This repository was archived by the owner on Jul 16, 2024. It is now read-only.

WIP: RT Search #86

Open: wants to merge 42 commits into base branch develop
Changes from all commits (42)
4b156dc
adding probability of reach for each player, probably buggy
big-c-note May 11, 2020
3b2cc02
adding method for dealing cards, changing real time flag since these …
big-c-note May 11, 2020
e15c405
adding TODOs
big-c-note May 11, 2020
cb91e0f
fix infoset lookup issue
big-c-note May 11, 2020
9355e2b
simple way of dealing with predetermined public cards
big-c-note May 12, 2020
28e3f58
fixing broken tests
big-c-note May 12, 2020
21da4bb
getting closer to using realtime, but I broke something in the bayes …
big-c-note May 12, 2020
e326629
fix syntax error
big-c-note May 12, 2020
e362924
work around for awkward deck class behavior
big-c-note May 12, 2020
a56ccca
fixing broken test, still a bug in deal_bayes method
big-c-note May 12, 2020
ae6968d
it's doing something
big-c-note May 13, 2020
9fe1775
adding normal deck back and testing, seems like the regret is reasona…
big-c-note May 13, 2020
09815cf
adding somewhat hacky update strategy
big-c-note May 14, 2020
8750ceb
trying with better ph estimation
big-c-note May 16, 2020
1398134
working out a few bugs
big-c-note May 16, 2020
e341ead
rebasing, confirming RT is running before refactor
big-c-note May 18, 2020
682298d
moving get_game_state to ShortDeckPokerState class
big-c-note May 18, 2020
bf3cfad
removing leftover code
big-c-note May 18, 2020
ebf0a9e
moving agent class to its own file
big-c-note May 18, 2020
da97637
forgot to add in last commit, removing agent class
big-c-note May 18, 2020
a982fff
removing agent strategy from the state class
big-c-note May 18, 2020
cd48a73
refactoring usages of the deck class and card class
big-c-note May 19, 2020
2bbb7e9
cleaning up some errors
big-c-note May 20, 2020
965139a
reorganizing methods, info_set_builder takes args, remove unused attr…
big-c-note May 20, 2020
4f237a9
making unnormalized strategy default, calculate strategy based on tem…
big-c-note May 23, 2020
e68efeb
cleaning up code and testing a few configs
big-c-note May 24, 2020
0cd1bba
beginnings of a test script
big-c-note May 24, 2020
f285442
updating offline_strategy on each dump int, wrapping test method into…
big-c-note May 25, 2020
203fb22
sample output of state class producing same results as develop branc…
big-c-note May 25, 2020
715c4f8
removing sample files
big-c-note May 25, 2020
90ae9c7
testing many different RTS configs
big-c-note May 26, 2020
5cdd94f
bug fix for dealing bayes hole cards before next community card
big-c-note May 27, 2020
a097810
fixing random strategy bug, adding entry script for rng game nodes an…
big-c-note May 28, 2020
e82eb41
fixing pytest build fail
big-c-note May 28, 2020
7b4b777
updating to decent defaults
big-c-note May 31, 2020
4d70d20
regression-style test for loading game history
big-c-note Jun 1, 2020
8d48cf3
using smaller lookup table for pytest
big-c-note Jun 3, 2020
8e05533
regression-style test for loading public info, won't work until I add …
big-c-note Jun 9, 2020
cc73933
reorganizing some
big-c-note Jun 13, 2020
1eb9a85
removing old files
big-c-note Jun 13, 2020
8aa9c09
shortening tests and test data, will make private data for fuller reg…
big-c-note Jun 13, 2020
c303e70
refactor and found a new bug in the tests
big-c-note Jun 14, 2020
30 changes: 30 additions & 0 deletions pluribus/games/short_deck/agent.py
@@ -0,0 +1,30 @@
import collections
import joblib


class Agent:
    """Agent class can hold a trained strategy and regret"""
    def __init__(self, regret_path=None):
        self.strategy = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
        if regret_path:
            offline_strategy = joblib.load(regret_path)
            self.regret = collections.defaultdict(
                lambda: collections.defaultdict(lambda: 0),
                offline_strategy['regret']
            )
        else:
            self.regret = collections.defaultdict(
                lambda: collections.defaultdict(lambda: 0)
            )
        self.tmp_regret = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )

    def reset_new_regret(self):
        """Remove regret from temporary storage"""
        del self.tmp_regret
        self.tmp_regret = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
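
For illustration, a minimal usage sketch of this `Agent` class; the temporary regret is what the real-time search in `RT_cfr.cfr` accumulates before it is folded back into `agent.regret` (the regret path in the last line is hypothetical):

from pluribus.games.short_deck.agent import Agent

# Fresh agent: regret and strategy default to 0 for unseen info sets and actions.
agent = Agent()
assert agent.regret["some_info_set"]["call"] == 0

# Accumulate some temporary regret during a search pass, then clear it.
agent.tmp_regret["some_info_set"]["call"] += 1.5
agent.reset_new_regret()
assert agent.tmp_regret["some_info_set"] == {}

# Hypothetical: seed regret from a previously dumped file with a 'regret' key.
# agent = Agent(regret_path="last_regret.gz")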
342 changes: 305 additions & 37 deletions pluribus/games/short_deck/state.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pluribus/poker/card.py
@@ -74,6 +74,9 @@ def __eq__(self, other):
    def __ne__(self, other):
        return int(self) != int(other)

    def __hash__(self):
        return hash(int(self))

    @property
    def eval_card(self) -> EvaluationCard:
        """Return an `EvaluationCard` for use in the `Evaluator`."""
@@ -178,4 +181,3 @@ def from_dict(x: Dict[str, Union[int, str]]):
        if set(x) != {"rank", "suit"}:
            raise NotImplementedError(f"Unrecognised dict {x}")
        return Card(rank=x["rank"], suit=x["suit"])
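
With `__hash__` defined consistently with `__eq__` (both go through `int(self)`), `Card` objects can now be used in sets and as dict keys. A small illustrative sketch:

from pluribus.poker.card import Card

seen = {Card("ace", "diamonds"), Card("king", "clubs")}
# A card constructed with the same rank and suit hashes (and compares) equal.
assert Card("ace", "diamonds") in seen

reach_probability = {Card("10", "hearts"): 0.25}  # cards as dict keys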

6 changes: 6 additions & 0 deletions pluribus/poker/deck.py
@@ -61,3 +61,9 @@ def pick(self, random: bool = True) -> Card:
        card: Card = self._cards_in_deck.pop(index)
        self._dealt_cards.append(card)
        return card

    def remove(self, card):
        """Remove a specific card from the deck"""
        if card in self._cards_in_deck:
            self._cards_in_deck.remove(card)
            self._dealt_cards.append(card)
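
A sketch of how `Deck.remove` might be used to reserve predetermined public cards before any random picks; the no-argument `Deck()` construction is an assumption here, since the constructor is not shown in this diff:

from pluribus.poker.card import Card
from pluribus.poker.deck import Deck

deck = Deck()  # assumption: constructor arguments are not shown in this diff

public_cards = [Card("ace", "diamonds"), Card("king", "clubs"), Card("jack", "spades")]
for card in public_cards:
    deck.remove(card)  # moved to _dealt_cards only if still in the deck

# Subsequent picks cannot return a reserved public card.
hole_card = deck.pick(random=True)
assert hole_card not in public_cards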
18 changes: 9 additions & 9 deletions research/blueprint_algo/blueprint_short_deck_poker.py
@@ -204,7 +204,7 @@ def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
            logging.debug(f"Got EV for {a}: {voa[a]}")
            vo += sigma[I][a] * voa[a]
            logging.debug(
-               f"""Added to Node EV for ACTION: {a} INFOSET: {I}
+               f"""Added to Node EV for ACTION: {a} INFOSET: {I}
                STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
            )
            logging.debug(f"Updated EV at {I}: {vo}")
@@ -346,16 +346,16 @@ def _create_dir() -> Path:


@click.command()
-@click.option("--strategy_interval", default=2, help=".")
-@click.option("--n_iterations", default=10, help=".")
-@click.option("--lcfr_threshold", default=80, help=".")
-@click.option("--discount_interval", default=1000, help=".")
-@click.option("--prune_threshold", default=4000, help=".")
+@click.option("--strategy_interval", default=400, help=".")
+@click.option("--n_iterations", default=5500, help=".")
+@click.option("--lcfr_threshold", default=400, help=".")
+@click.option("--discount_interval", default=400, help=".")
+@click.option("--prune_threshold", default=400, help=".")
@click.option("--c", default=-20000, help=".")
@click.option("--n_players", default=3, help=".")
-@click.option("--print_iteration", default=10, help=".")
-@click.option("--dump_iteration", default=10, help=".")
-@click.option("--update_threshold", default=0, help=".")
+@click.option("--print_iteration", default=100, help=".")
+@click.option("--dump_iteration", default=20, help=".")
+@click.option("--update_threshold", default=400, help=".")
def train(
    strategy_interval: int,
    n_iterations: int,
30 changes: 30 additions & 0 deletions research/rts/RT.py
@@ -0,0 +1,30 @@
from typing import List
import joblib

from RT_cfr import rts
from pluribus.poker.card import Card


if __name__ == "__main__":
    # We can set public cards or not
    public_cards = [Card("ace", "diamonds"), Card("king", "clubs"),
                    Card("jack", "spades"), Card("10", "hearts"),
                    Card("10", "spades")]
    # Action sequence must be in old form (one list, includes skips)
    action_sequence = ["raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call", "call"]
    agent_output, offline_strategy = rts(
        'test_strategy2/unnormalized_output/offline_strategy_1500.gz',
        'test_strategy2/strategy_1500.gz', public_cards, action_sequence,
        1400, 1, 1, 3, 1, 1, 20
    )
    save_path = "test_strategy2/unnormalized_output/"
    last_regret = {
        info_set: dict(strategy)
        for info_set, strategy in agent_output.regret.items()
    }
    joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip")
    joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip")
    import ipdb; ipdb.set_trace()
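
A minimal sketch for inspecting the artefacts this script dumps, assuming the run above completed and both files exist:

import joblib

offline_strategy = joblib.load("test_strategy2/unnormalized_output/rts_output.gz")
last_regret = joblib.load("test_strategy2/unnormalized_output/last_regret.gz")

# Both map an info-set string to an {action: value} dict.
info_set, action_values = next(iter(last_regret.items()))
print(info_set, action_values)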
203 changes: 203 additions & 0 deletions research/rts/RT_cfr.py
@@ -0,0 +1,203 @@
from __future__ import annotations

import collections
from typing import Dict, List
import joblib
from pathlib import Path

from tqdm import trange
import numpy as np
import datetime
import yaml

from pluribus import utils
from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
from pluribus.games.short_deck.agent import Agent
from pluribus.poker.card import Card


def normalize_strategy(this_info_sets_regret: Dict[str, float]) -> Dict[str, float]:
    """Calculate the strategy based on the current information set's regret."""
    actions = this_info_sets_regret.keys()
    regret_sum = sum([max(regret, 0) for regret in this_info_sets_regret.values()])
    if regret_sum > 0:
        strategy: Dict[str, float] = {
            action: max(this_info_sets_regret[action], 0) / regret_sum
            for action in actions
        }
    elif this_info_sets_regret == {}:
        # Don't return a strategy if none was made during training
        strategy: Dict[str, float] = {}
    elif regret_sum == 0:
        # All regret is non-positive; fall back to a uniform strategy
        default_probability = 1 / len(actions)
        strategy: Dict[str, float] = {action: default_probability for action in actions}
    return strategy


def calculate_strategy(
    regret: Dict[str, Dict[str, float]],
    I: str,
    state: ShortDeckPokerState,
) -> Dict[str, Dict[str, float]]:
    """
    Calculate strategy based on regret
    """
    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
    rsum = sum([max(x, 0) for x in regret[I].values()])
    for a in state.legal_actions:
        if rsum > 0:
            sigma[I][a] = max(regret[I][a], 0) / rsum
        else:
            sigma[I][a] = 1 / len(state.legal_actions)
    return sigma


def _create_dir(folder_id: str) -> Path:
    """Create and get a unique dir path to save to using a timestamp."""
    time = str(datetime.datetime.now())
    for char in ":- .":
        time = time.replace(char, "_")
    path: Path = Path(f"./{folder_id}_results_{time}")
    path.mkdir(parents=True, exist_ok=True)
    return path


def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
    """
    CFR algo with a temporary regret object for better strategy averaging
    """
    ph = state.player_i

    player_not_in_hand = not state.players[i].is_active
    if state.is_terminal or player_not_in_hand:
        return state.payout[i]

    elif ph == i:
        I = state.info_set
        # Move regret over to a temporary object and build off that
        if agent.tmp_regret[I] == {}:
            agent.tmp_regret[I] = agent.regret[I].copy()
        sigma = calculate_strategy(agent.tmp_regret, I, state)

        vo = 0.0
        voa = {}
        for a in state.legal_actions:
            new_state: ShortDeckPokerState = state.apply_action(a)
            voa[a] = cfr(agent, new_state, i, t)
            vo += sigma[I][a] * voa[a]

        for a in state.legal_actions:
            agent.tmp_regret[I][a] += voa[a] - vo

        return vo
    else:
        Iph = state.info_set
        # Move regret over to a temporary object and build off that
        if agent.tmp_regret[Iph] == {}:
            agent.tmp_regret[Iph] = agent.regret[Iph].copy()
        sigma = calculate_strategy(agent.tmp_regret, Iph, state)

        try:
            a = np.random.choice(
                list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
            )[0]
        except KeyError:
            p = 1 / len(state.legal_actions)
            probabilities = np.full(len(state.legal_actions), p)
            a = np.random.choice(state.legal_actions, p=probabilities)
            sigma[Iph] = {action: p for action in state.legal_actions}
        except Exception:
            import ipdb; ipdb.set_trace()

        new_state: ShortDeckPokerState = state.apply_action(a)
        return cfr(agent, new_state, i, t)


def rts(
    offline_strategy_path: str,
    last_regret_path: str,
    public_cards: list,
    action_sequence: list,
    n_iterations: int,
    lcfr_threshold: int,
    discount_interval: int,
    n_players: int,
    update_interval: int,
    update_threshold: int,
    dump_int: int,
):
    """RTS."""
    config: Dict[str, int] = {**locals()}
    save_path: Path = _create_dir('RTS')
    with open(save_path / "config.yaml", "w") as steam:
        yaml.dump(config, steam)
    # TODO: fix the seed
    # utils.random.seed(36)
    agent = Agent(regret_path=last_regret_path)
    # Load unnormalized strategy to build off
    offline_strategy = joblib.load(offline_strategy_path)
    state: ShortDeckPokerState = new_game(
        3, real_time_test=True, public_cards=public_cards
    )
    # Load current game state
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy, action_sequence
    )
    for t in trange(1, n_iterations + 1, desc="train iter"):
        for i in range(n_players):  # fixed position i
            # Deal hole cards based on bayesian updating of hole card probs
            state: ShortDeckPokerState = current_game_state.deal_bayes()
            cfr(agent, state, i, t)
        if t < lcfr_threshold and t % discount_interval == 0:
            d = (t / discount_interval) / ((t / discount_interval) + 1)
            for I in agent.tmp_regret.keys():
                for a in agent.tmp_regret[I].keys():
                    agent.tmp_regret[I][a] *= d
        # Add the unnormalized strategy into the original
        # Right now assumes n_iterations is a multiple of dump_int
        if t % dump_int == 0:
            # Adding the regret back to the regret dict, we'll build off it for
            # the next RTS
            for I in agent.tmp_regret.keys():
                if agent.tmp_regret[I] != {}:
                    agent.regret[I] = agent.tmp_regret[I].copy()
            for info_set, this_info_sets_regret in sorted(agent.tmp_regret.items()):
                # If this_info_sets_regret == {}, we do nothing
                strategy = normalize_strategy(this_info_sets_regret)
                # Check if info_set exists..
                no_info_set = info_set not in offline_strategy
                if no_info_set or offline_strategy[info_set] == {}:
                    offline_strategy[info_set] = {a: 0 for a in strategy.keys()}
                for action, probability in strategy.items():
                    offline_strategy[info_set][action] += probability
            agent.reset_new_regret()

    return agent, offline_strategy


if __name__ == "__main__":
    # We can set public cards or not
    public_cards = [Card("ace", "diamonds"), Card("king", "clubs"),
                    Card("jack", "spades"), Card("10", "hearts"),
                    Card("10", "spades")]
    # Action sequence must be in old form (one list, includes skips)
    action_sequence = ["raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call", "call"]
    agent_output, offline_strategy = rts(
        'test_strategy3/unnormalized_output/offline_strategy_1500.gz',
        'test_strategy3/strategy.gz', public_cards, action_sequence,
        1400, 1, 1, 3, 1, 1, 20
    )
    save_path = "test_strategy3/unnormalized_output/"
    last_regret = {
        info_set: dict(strategy)
        for info_set, strategy in agent_output.regret.items()
    }
    joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip")
    joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip")
    import ipdb; ipdb.set_trace()
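
To make the regret matching in `normalize_strategy` concrete, a small self-contained check (run from `research/rts/` so that `RT_cfr` imports; the numbers are purely illustrative):

from RT_cfr import normalize_strategy

# Positive regrets are normalised proportionally; negative regrets are clipped to 0.
assert normalize_strategy({"raise": 3.0, "call": 1.0, "fold": -2.0}) == {
    "raise": 0.75, "call": 0.25, "fold": 0.0
}

# No positive regret left: fall back to a uniform strategy over the known actions.
assert normalize_strategy({"call": -1.0, "fold": -1.0}) == {"call": 0.5, "fold": 0.5}

# An info set never visited during training yields an empty strategy.
assert normalize_strategy({}) == {}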