This repository was archived by the owner on Jul 16, 2024. It is now read-only.

WIP: RT Search #86

Open: wants to merge 42 commits into base branch develop
Changes from all commits (42)
4b156dc
adding probability of reach for each player, probably buggy
big-c-note May 11, 2020
3b2cc02
adding method for dealing cards, changing real time flag since these …
big-c-note May 11, 2020
e15c405
adding TODOs
big-c-note May 11, 2020
cb91e0f
fix infoset lookup issue
big-c-note May 11, 2020
9355e2b
simple way of dealing with predetermined public cards
big-c-note May 12, 2020
28e3f58
fixing broken tests
big-c-note May 12, 2020
21da4bb
getting closer to using realtime, but I broke something in the bayes …
big-c-note May 12, 2020
e326629
fix syntax error
big-c-note May 12, 2020
e362924
work around for awkward deck class behavior
big-c-note May 12, 2020
a56ccca
fixing broken test, still a bug in deal_bayes method
big-c-note May 12, 2020
ae6968d
it's doing something
big-c-note May 13, 2020
9fe1775
adding normal deck back and testing, seems like the regret is reasona…
big-c-note May 13, 2020
09815cf
adding somewhat hacky update strategy
big-c-note May 14, 2020
8750ceb
trying with better ph estimation
big-c-note May 16, 2020
1398134
working out a few bugs
big-c-note May 16, 2020
e341ead
rebasing, confirming RT is running before refactor
big-c-note May 18, 2020
682298d
moving get_game_state to ShortDeckPokerState class
big-c-note May 18, 2020
bf3cfad
removing leftover code
big-c-note May 18, 2020
ebf0a9e
moving agent class to its own file
big-c-note May 18, 2020
da97637
forgot to add in last commit, removing agent class
big-c-note May 18, 2020
a982fff
removing agent strategy from the state class
big-c-note May 18, 2020
cd48a73
refactoring usages of the deck class and card class
big-c-note May 19, 2020
2bbb7e9
cleaning up some errors
big-c-note May 20, 2020
965139a
reorganizing methods, info_set_builder takes args, remove unused attr…
big-c-note May 20, 2020
4f237a9
making unnormalized strategy default, calculate strategy based on tem…
big-c-note May 23, 2020
e68efeb
cleaning up code and testing a few configs
big-c-note May 24, 2020
0cd1bba
beginnings of a test script
big-c-note May 24, 2020
f285442
updating offline_strategy on each dump int, wrapping test method into…
big-c-note May 25, 2020
203fb22
sample output of state class producing same results as develop branc…
big-c-note May 25, 2020
715c4f8
removing sample files
big-c-note May 25, 2020
90ae9c7
testing many different RTS configs
big-c-note May 26, 2020
5cdd94f
bug fix for dealing bayes hole cards before next community card
big-c-note May 27, 2020
a097810
fixing random strategy bug, adding entry script for rng game nodes an…
big-c-note May 28, 2020
e82eb41
fixing pytest build fail
big-c-note May 28, 2020
7b4b777
updating to decent defaults
big-c-note May 31, 2020
4d70d20
regression-style test for loading game history
big-c-note Jun 1, 2020
8d48cf3
using smaller lookup table for pytest
big-c-note Jun 3, 2020
8e05533
regression-style test for loading public info, won't work until I add …
big-c-note Jun 9, 2020
cc73933
reorganizing some
big-c-note Jun 13, 2020
1eb9a85
removing old files
big-c-note Jun 13, 2020
8aa9c09
shortening tests and test data, will make private data for fuller reg…
big-c-note Jun 13, 2020
c303e70
refactor and found a new bug in the tests
big-c-note Jun 14, 2020
30 changes: 30 additions & 0 deletions pluribus/games/short_deck/agent.py
@@ -0,0 +1,30 @@
import collections
import joblib


class Agent:
    """Agent class can hold a trained strategy and regret"""
    def __init__(self, regret_path=None):
        self.strategy = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
        if regret_path:
            offline_strategy = joblib.load(regret_path)
            self.regret = collections.defaultdict(
                lambda: collections.defaultdict(lambda: 0),
                offline_strategy['regret']
            )
        else:
            self.regret = collections.defaultdict(
                lambda: collections.defaultdict(lambda: 0)
            )
        self.tmp_regret = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )

    def reset_new_regret(self):
        """Remove regret from temporary storage"""
        del self.tmp_regret
        self.tmp_regret = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )
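
For illustration, a minimal usage sketch of this `Agent` class; the temporary regret is what the real-time search in `RT_cfr.cfr` accumulates before it is folded back into `agent.regret` (the regret path in the last line is hypothetical):

from pluribus.games.short_deck.agent import Agent

# Fresh agent: regret and strategy default to 0 for unseen info sets and actions.
agent = Agent()
assert agent.regret["some_info_set"]["call"] == 0

# Accumulate some temporary regret during a search pass, then clear it.
agent.tmp_regret["some_info_set"]["call"] += 1.5
agent.reset_new_regret()
assert agent.tmp_regret["some_info_set"] == {}

# Hypothetical: seed regret from a previously dumped file with a 'regret' key.
# agent = Agent(regret_path="last_regret.gz")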
342 changes: 305 additions & 37 deletions pluribus/games/short_deck/state.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pluribus/poker/card.py
@@ -74,6 +74,9 @@ def __eq__(self, other):
    def __ne__(self, other):
        return int(self) != int(other)

    def __hash__(self):
        return hash(int(self))

    @property
    def eval_card(self) -> EvaluationCard:
        """Return an `EvaluationCard` for use in the `Evaluator`."""
@@ -178,4 +181,3 @@ def from_dict(x: Dict[str, Union[int, str]]):
        if set(x) != {"rank", "suit"}:
            raise NotImplementedError(f"Unrecognised dict {x}")
        return Card(rank=x["rank"], suit=x["suit"])
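
With `__hash__` defined consistently with `__eq__` (both go through `int(self)`), `Card` objects can now be used in sets and as dict keys. A small illustrative sketch:

from pluribus.poker.card import Card

seen = {Card("ace", "diamonds"), Card("king", "clubs")}
# A card constructed with the same rank and suit hashes (and compares) equal.
assert Card("ace", "diamonds") in seen

reach_probability = {Card("10", "hearts"): 0.25}  # cards as dict keys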

6 changes: 6 additions & 0 deletions pluribus/poker/deck.py
@@ -61,3 +61,9 @@ def pick(self, random: bool = True) -> Card:
        card: Card = self._cards_in_deck.pop(index)
        self._dealt_cards.append(card)
        return card

    def remove(self, card):
        """Remove a specific card from the deck"""
        if card in self._cards_in_deck:
            self._cards_in_deck.remove(card)
            self._dealt_cards.append(card)
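
A sketch of how `Deck.remove` might be used to reserve predetermined public cards before any random picks; the no-argument `Deck()` construction is an assumption here, since the constructor is not shown in this diff:

from pluribus.poker.card import Card
from pluribus.poker.deck import Deck

deck = Deck()  # assumption: constructor arguments are not shown in this diff

public_cards = [Card("ace", "diamonds"), Card("king", "clubs"), Card("jack", "spades")]
for card in public_cards:
    deck.remove(card)  # moved to _dealt_cards only if still in the deck

# Subsequent picks cannot return a reserved public card.
hole_card = deck.pick(random=True)
assert hole_card not in public_cards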
18 changes: 9 additions & 9 deletions research/blueprint_algo/blueprint_short_deck_poker.py
@@ -204,7 +204,7 @@ def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
            logging.debug(f"Got EV for {a}: {voa[a]}")
            vo += sigma[I][a] * voa[a]
            logging.debug(
-               f"""Added to Node EV for ACTION: {a} INFOSET: {I}
+               f"""Added to Node EV for ACTION: {a} INFOSET: {I}
                STRATEGY: {sigma[I][a]}: {sigma[I][a] * voa[a]}"""
            )
            logging.debug(f"Updated EV at {I}: {vo}")
@@ -346,16 +346,16 @@ def _create_dir() -> Path:


@click.command()
-@click.option("--strategy_interval", default=2, help=".")
-@click.option("--n_iterations", default=10, help=".")
-@click.option("--lcfr_threshold", default=80, help=".")
-@click.option("--discount_interval", default=1000, help=".")
-@click.option("--prune_threshold", default=4000, help=".")
+@click.option("--strategy_interval", default=400, help=".")
+@click.option("--n_iterations", default=5500, help=".")
+@click.option("--lcfr_threshold", default=400, help=".")
+@click.option("--discount_interval", default=400, help=".")
+@click.option("--prune_threshold", default=400, help=".")
@click.option("--c", default=-20000, help=".")
@click.option("--n_players", default=3, help=".")
-@click.option("--print_iteration", default=10, help=".")
-@click.option("--dump_iteration", default=10, help=".")
-@click.option("--update_threshold", default=0, help=".")
+@click.option("--print_iteration", default=100, help=".")
+@click.option("--dump_iteration", default=20, help=".")
+@click.option("--update_threshold", default=400, help=".")
def train(
    strategy_interval: int,
    n_iterations: int,
30 changes: 30 additions & 0 deletions research/rts/RT.py
@@ -0,0 +1,30 @@
from typing import List
import joblib

from RT_cfr import rts
from pluribus.poker.card import Card


if __name__ == "__main__":
    # We can set public cards or not
    public_cards = [Card("ace", "diamonds"), Card("king", "clubs"),
                    Card("jack", "spades"), Card("10", "hearts"),
                    Card("10", "spades")]
    # Action sequence must be in old form (one list, includes skips)
    action_sequence = ["raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call", "call"]
    agent_output, offline_strategy = rts(
        'test_strategy2/unnormalized_output/offline_strategy_1500.gz',
        'test_strategy2/strategy_1500.gz', public_cards, action_sequence,
        1400, 1, 1, 3, 1, 1, 20
    )
    save_path = "test_strategy2/unnormalized_output/"
    last_regret = {
        info_set: dict(strategy)
        for info_set, strategy in agent_output.regret.items()
    }
    joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip")
    joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip")
    import ipdb; ipdb.set_trace()
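
A minimal sketch for inspecting the artefacts this script dumps, assuming the run above completed and both files exist:

import joblib

offline_strategy = joblib.load("test_strategy2/unnormalized_output/rts_output.gz")
last_regret = joblib.load("test_strategy2/unnormalized_output/last_regret.gz")

# Both map an info-set string to an {action: value} dict.
info_set, action_values = next(iter(last_regret.items()))
print(info_set, action_values)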
203 changes: 203 additions & 0 deletions research/rts/RT_cfr.py
@@ -0,0 +1,203 @@
from __future__ import annotations

import collections
from typing import Dict, List
import joblib
from pathlib import Path

from tqdm import trange
import numpy as np
import datetime
import yaml

from pluribus import utils
from pluribus.games.short_deck.state import ShortDeckPokerState, new_game
from pluribus.games.short_deck.agent import Agent
from pluribus.poker.card import Card


def normalize_strategy(this_info_sets_regret: Dict[str, float]) -> Dict[str, float]:
    """Calculate the strategy based on the current information set's regret."""
    actions = this_info_sets_regret.keys()
    regret_sum = sum([max(regret, 0) for regret in this_info_sets_regret.values()])
    if regret_sum > 0:
        strategy: Dict[str, float] = {
            action: max(this_info_sets_regret[action], 0) / regret_sum
            for action in actions
        }
    elif this_info_sets_regret == {}:
        # Don't return a strategy if none was made during training
        strategy: Dict[str, float] = {}
    elif regret_sum == 0:
        # All regret is non-positive; fall back to a uniform strategy
        default_probability = 1 / len(actions)
        strategy: Dict[str, float] = {action: default_probability for action in actions}
    return strategy


def calculate_strategy(
    regret: Dict[str, Dict[str, float]],
    I: str,
    state: ShortDeckPokerState,
) -> Dict[str, Dict[str, float]]:
    """
    Calculate strategy based on regret
    """
    sigma = collections.defaultdict(lambda: collections.defaultdict(lambda: 1 / 3))
    rsum = sum([max(x, 0) for x in regret[I].values()])
    for a in state.legal_actions:
        if rsum > 0:
            sigma[I][a] = max(regret[I][a], 0) / rsum
        else:
            sigma[I][a] = 1 / len(state.legal_actions)
    return sigma


def _create_dir(folder_id: str) -> Path:
    """Create and get a unique dir path to save to using a timestamp."""
    time = str(datetime.datetime.now())
    for char in ":- .":
        time = time.replace(char, "_")
    path: Path = Path(f"./{folder_id}_results_{time}")
    path.mkdir(parents=True, exist_ok=True)
    return path


def cfr(agent: Agent, state: ShortDeckPokerState, i: int, t: int) -> float:
    """
    CFR algo with a temporary regret object for better strategy averaging
    """
    ph = state.player_i

    player_not_in_hand = not state.players[i].is_active
    if state.is_terminal or player_not_in_hand:
        return state.payout[i]

    elif ph == i:
        I = state.info_set
        # Move regret over to a temporary object and build off that
        if agent.tmp_regret[I] == {}:
            agent.tmp_regret[I] = agent.regret[I].copy()
        sigma = calculate_strategy(agent.tmp_regret, I, state)

        vo = 0.0
        voa = {}
        for a in state.legal_actions:
            new_state: ShortDeckPokerState = state.apply_action(a)
            voa[a] = cfr(agent, new_state, i, t)
            vo += sigma[I][a] * voa[a]

        for a in state.legal_actions:
            agent.tmp_regret[I][a] += voa[a] - vo

        return vo
    else:
        Iph = state.info_set
        # Move regret over to a temporary object and build off that
        if agent.tmp_regret[Iph] == {}:
            agent.tmp_regret[Iph] = agent.regret[Iph].copy()
        sigma = calculate_strategy(agent.tmp_regret, Iph, state)

        try:
            a = np.random.choice(
                list(sigma[Iph].keys()), 1, p=list(sigma[Iph].values()),
            )[0]
        except KeyError:
            p = 1 / len(state.legal_actions)
            probabilities = np.full(len(state.legal_actions), p)
            a = np.random.choice(state.legal_actions, p=probabilities)
            sigma[Iph] = {action: p for action in state.legal_actions}
        except Exception:
            import ipdb; ipdb.set_trace()

        new_state: ShortDeckPokerState = state.apply_action(a)
        return cfr(agent, new_state, i, t)


def rts(
    offline_strategy_path: str,
    last_regret_path: str,
    public_cards: list,
    action_sequence: list,
    n_iterations: int,
    lcfr_threshold: int,
    discount_interval: int,
    n_players: int,
    update_interval: int,
    update_threshold: int,
    dump_int: int,
):
    """RTS."""
    config: Dict[str, int] = {**locals()}
    save_path: Path = _create_dir('RTS')
    with open(save_path / "config.yaml", "w") as steam:
        yaml.dump(config, steam)
    # TODO: fix the seed
    # utils.random.seed(36)
    agent = Agent(regret_path=last_regret_path)
    # Load unnormalized strategy to build off
    offline_strategy = joblib.load(offline_strategy_path)
    state: ShortDeckPokerState = new_game(
        3, real_time_test=True, public_cards=public_cards
    )
    # Load current game state
    current_game_state: ShortDeckPokerState = state.load_game_state(
        offline_strategy, action_sequence
    )
    for t in trange(1, n_iterations + 1, desc="train iter"):
        for i in range(n_players):  # fixed position i
            # Deal hole cards based on bayesian updating of hole card probs
            state: ShortDeckPokerState = current_game_state.deal_bayes()
            cfr(agent, state, i, t)
        if t < lcfr_threshold and t % discount_interval == 0:
            d = (t / discount_interval) / ((t / discount_interval) + 1)
            for I in agent.tmp_regret.keys():
                for a in agent.tmp_regret[I].keys():
                    agent.tmp_regret[I][a] *= d
        # Add the unnormalized strategy into the original
        # Right now assumes n_iterations is a multiple of dump_int
        if t % dump_int == 0:
            # Adding the regret back to the regret dict, we'll build off it for
            # the next RTS
            for I in agent.tmp_regret.keys():
                if agent.tmp_regret[I] != {}:
                    agent.regret[I] = agent.tmp_regret[I].copy()
            for info_set, this_info_sets_regret in sorted(agent.tmp_regret.items()):
                # If this_info_sets_regret == {}, we do nothing
                strategy = normalize_strategy(this_info_sets_regret)
                # Check if info_set exists..
                no_info_set = info_set not in offline_strategy
                if no_info_set or offline_strategy[info_set] == {}:
                    offline_strategy[info_set] = {a: 0 for a in strategy.keys()}
                for action, probability in strategy.items():
                    offline_strategy[info_set][action] += probability
            agent.reset_new_regret()

    return agent, offline_strategy


if __name__ == "__main__":
    # We can set public cards or not
    public_cards = [Card("ace", "diamonds"), Card("king", "clubs"),
                    Card("jack", "spades"), Card("10", "hearts"),
                    Card("10", "spades")]
    # Action sequence must be in old form (one list, includes skips)
    action_sequence = ["raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call",
                       "raise", "raise", "raise", "call", "call", "call"]
    agent_output, offline_strategy = rts(
        'test_strategy3/unnormalized_output/offline_strategy_1500.gz',
        'test_strategy3/strategy.gz', public_cards, action_sequence,
        1400, 1, 1, 3, 1, 1, 20
    )
    save_path = "test_strategy3/unnormalized_output/"
    last_regret = {
        info_set: dict(strategy)
        for info_set, strategy in agent_output.regret.items()
    }
    joblib.dump(offline_strategy, save_path + 'rts_output.gz', compress="gzip")
    joblib.dump(last_regret, save_path + 'last_regret.gz', compress="gzip")
    import ipdb; ipdb.set_trace()
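
To make the regret matching in `normalize_strategy` concrete, a small self-contained check (run from `research/rts/` so that `RT_cfr` imports; the numbers are purely illustrative):

from RT_cfr import normalize_strategy

# Positive regrets are normalised proportionally; negative regrets are clipped to 0.
assert normalize_strategy({"raise": 3.0, "call": 1.0, "fold": -2.0}) == {
    "raise": 0.75, "call": 0.25, "fold": 0.0
}

# No positive regret left: fall back to a uniform strategy over the known actions.
assert normalize_strategy({"call": -1.0, "fold": -1.0}) == {"call": 0.5, "fold": 0.5}

# An info set never visited during training yields an empty strategy.
assert normalize_strategy({}) == {}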