% [pi,V] = valueIteraton(P,R, gamma)
%
% Implementation of Value Iteration, as presented in Sutton and Barto 4.4.
%
% Given the (s x s' x a) matrices P and R and discount factor gamma, use value iteration to find a
% deterministic policy pi and value function V.
%
function [pi,V] = valueIteration(P, R, gamma)

theta = 10e-6;

NS = size(P,2);
NA = size(P,3);

% init
V = zeros(1,NS);

% policy evaluation
delta = theta+1;
while delta >= theta
    delta = 0;
    for s = 1:NS
        for a = 1:NA
            a_val(a) = sum(P(s,:,a) .* (R(s,:,a) + gamma * V));
        end
        newv(s) = max(a_val);
    end
    delta = max(abs(newv-V));
    V = newv;
end

% now find the policy
for s=1:NS
    for a = 1:NA
        a_val(a) = sum(P(s,:,a) .* (R(s,:,a) + gamma * V));
    end
    [val, pi(s)] = max(a_val);
end


