value_iteration HA 7.py
import numpy as np


def expected_state_action_reward(env, s, a, V):
    """
    Computes the expected reward of taking action a in state s, according to the given environment.
    I.e. compute:
        r(s, a) + sum_{s'} p(s'|s, a) * V(s')
    :param env: an MDP environment
    :param s: a state
    :param a: an action
    :param V: a value function
    :return: the expected reward of taking action a in state s, according to the given environment
    """
    return env.R[s, a] + np.sum([env.P[s, a, s_] * V[s_] for s_ in range(env.nS)])


def value_to_policy(V, env):
    """
    Computes a greedy policy with respect to the value function V.
    :param V: a value function
    :param env: an MDP environment
    :return: an optimal policy based on V
    """
    policy = np.zeros(env.nS, dtype=int)
    for s in range(env.nS):
        policy[s] = np.argmax([expected_state_action_reward(env, s, a, V) for a in range(env.nA)])
    return policy


def span_semi_norm(f):
    """
    Computes the span semi-norm of f, i.e. max(f) - min(f).
    :param f: a vector in R^S (a function over states)
    :return: the span of f
    """
    return np.max(f) - np.min(f)


def value_iteration_average_reward(env, epsilon=10 ** -6):
    """
    Performs value iteration on the given environment, for an average-reward objective.
    :param env: an MDP environment
    :param epsilon: the precision required for the value function
    :return:
        V: the value function
        policy: an optimal policy based on V
        gain: the gain of 'policy'
        bias: an associated bias function
        n_iter: the number of iterations required to converge
    """
    # Initialization
    V0 = np.zeros(env.nS)
    V1 = np.zeros(env.nS)
    n_iter = 0

    # Iterate until the span of the Bellman update falls below epsilon
    while True:
        n_iter += 1
        V0 = V1.copy()
        for s in range(env.nS):
            V1[s] = np.max([expected_state_action_reward(env, s, a, V0) for a in range(env.nA)])
        if span_semi_norm(V1 - V0) < epsilon:
            break

    # Compute policy, gain and bias
    V = V1  # Final value function
    policy = value_to_policy(V, env)
    gain = (np.max(V1 - V0) + np.min(V1 - V0)) / 2
    bias = V - np.min(V)
    return V, policy, gain, bias, n_iter
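

# Minimal usage sketch (not part of the original file): it assumes an environment
# object exposing nS, nA, a reward array R of shape (nS, nA) and a transition array
# P of shape (nS, nA, nS), which is what the functions above rely on. The two-state
# MDP below is a hypothetical example for illustration only.
if __name__ == "__main__":
    class _ToyMDP:
        nS = 2  # number of states
        nA = 2  # number of actions
        # R[s, a]: immediate reward for taking action a in state s
        R = np.array([[0.0, 1.0],
                      [0.5, 0.0]])
        # P[s, a, s']: probability of reaching s' after taking action a in state s
        P = np.array([[[0.9, 0.1], [0.2, 0.8]],
                      [[0.6, 0.4], [0.5, 0.5]]])

    env = _ToyMDP()
    V, policy, gain, bias, n_iter = value_iteration_average_reward(env)
    print("policy:", policy, "gain:", gain, "iterations:", n_iter)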