% pi = policyIteraton(P,R, gamma)
%
% Implementation of Policy Iteration, as presented in Sutton and Barto 4.3.
%
% Given the (s x s' x a) matrices P and R and discount factor gamma, use policy iteration to find a
% deterministic policy pi and value function V.
%
function [pi,V] = policyIteration(P, R, gamma)

theta = 10e-6;

NS = size(P,2);
NA = size(P,3);

% init
V = zeros(1,NS);
pi = ones(1,NS);

policyStable = false;
while policyStable == false
    % policy evaluation
    delta = theta+1;
    while delta >= theta
        delta = 0;
        for s = 1:NS
            newv(s) = sum(P(s,:,pi(s)) .* (R(s,:,pi(s)) + gamma * V));
        end
        delta = max(abs(newv-V));
        V = newv;
    end

    % now do policy improvement
    policyStable = true;
    for s=1:NS
        b = pi(s);
        a_val = zeros(1,NA);
        for a = 1:NA
            a_val(a) = sum(P(s,:,a) .* (R(s,:,a) + gamma * V));
        end
        [val, pi(s)] = max(a_val);
        if b ~= pi(s)
            policyStable = false;
        end
    end
end


