
%% Load data
% Start from an empty workspace and read the OCR dataset. X, y and fold are
% not defined anywhere in this script, so they presumably come from ocr.mat.
clear all
load('ocr.mat');
addBias = 1;
[data,w] = crfChain_prepareData(X,y,addBias);

% Split into training and testing sentences.
% A sentence is held out for testing when the fold value looked up through the
% first column of data.sentences is 0; every other sentence is used for training.
sentenceFold = fold(data.sentences(:,1));
testNdx = (sentenceFold == 0);
trainNdx = ~testNdx;
dataTest = data; % Copy of data; only the sentence list is changed below
dataTest.sentences = data.sentences(testNdx,:);
data.sentences = data.sentences(trainNdx,:);

%% Parameters of Optimization
n = size(data.sentences,1); % Number of training examples (rows of data.sentences)
p = length(w); % Dimension of parameter vector
nPasses = 20; % Do this many passes through the data
lambda = 1/n; % Regularization of averaged negative log-likelihood
% (regularization of non-averaged negative log-likelihood is n times this value)

% Training objective over all training sentences
funObj = @(wFull)penalizedL2(wFull,@crfChain_lossC,n*lambda/2,data);

% Training objective for the batch of sentences selected by the index vector b;
% the penalty weight is scaled by the batch size instead of n
funObjBatch = @(wBatch,b)penalizedL2(wBatch,@crfChain_lossC,length(b)*lambda/2, ...
    setfield(data,'sentences',data.sentences(b,:)));

% Testing objective, evaluated on the held-out sentences in dataTest
funObjTest = @(wEval)crfChain_testErrC(wEval,dataTest);

%% L-BFGS
fprintf('Training with L-BFGS....');
% Optimizer settings, assembled in a single struct call
options = struct( ...
    'maxFunEvals',nPasses, ... Restrict the number of passes
    'Display','none', ...      Turn off reporting progress
    'Corr',10);              % Number of parameter/gradient differences to store
% Optimize the full training objective starting from the all-zero vector
w = minFunc(funObj,zeros(p,1),options);
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% Hybrid
fprintf('Training with Hybrid....');
% Optimizer settings, assembled in a single struct call
options = struct( ...
    'maxIter',nPasses, ... Restrict the number of passes
    'corr',10, ...         Number of parameter/gradient differences to store
    'verbose',0, ...       Turn off reporting progress
    'strongly',1);       % Flag that objective is strongly-convex
% Optimize the batch objective from the all-zero vector; the second return
% value is kept in the workspace but is not used further in this script
[w,output] = batchingLBFGS(funObjBatch,zeros(p,1),n,options);
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% Pegasos
fprintf('Training with Pegasos...');
eta = 1e-4; % Tunable step size
nIters = n*nPasses; % Total number of stochastic iterations
w = zeros(p,1);
average = int32(0); % Do not return the average
iVals = int32(ceil(rand(nIters,1)*n)); % Sequence of examples to look at
stepSizes = eta./(lambda*(1:nIters)); % Step size eta/(lambda*k) at iteration k
% No output is captured: w is presumably updated in place by this call,
% since it is what gets evaluated on the next line
crfChain_SGC(w,data,lambda,stepSizes,iVals,average);
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% SG
fprintf('Training with SG........');
eta = 1e-2; % Tunable step size
nIters = n*nPasses; % Total number of stochastic iterations
w = zeros(p,1);
average = int32(0); % Do not return the average
iVals = int32(ceil(rand(nIters,1)*n)); % Sequence of examples to look at
stepSizes = repmat(eta,nIters,1); % Constant sequence of step sizes
crfChain_SGC(w,data,lambda,stepSizes,iVals,average); % Modifies w in place
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% ASG
fprintf('Training with ASG.......');
eta = 1e-2; % Tunable step size
nIters = n*nPasses; % Total number of stochastic iterations
w = zeros(p,1);
average = int32(1); % Return the average
iVals = int32(ceil(rand(nIters,1)*n)); % Sequence of examples to look at
stepSizes = repmat(eta,nIters,1); % Constant sequence of step sizes
% Modifies w in place and returns wAvg; the averaged vector is the one reported
wAvg = crfChain_SGC(w,data,lambda,stepSizes,iVals,average);
SAG4CRF_printTrainTest(wAvg,n,funObj,funObjTest)

%% AdaGrad
fprintf('Training with AdaGrad...');
eta = 1; % Tunable step size
nIters = n*nPasses; % Total number of stochastic iterations
w = zeros(p,1);
D = ones(p,1); % Initial diagonal approximation
iVals = int32(ceil(rand(nIters,1)*n)); % Sequence of examples to look at
stepSizes = repmat(eta,nIters,1); % Constant sequence of step sizes
% No output is captured: w is presumably updated in place by this call,
% since it is what gets evaluated on the next line
crfChain_AdaGradC(w,data,lambda,stepSizes,iVals,D);
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% OEG
fprintf('Training with OEG.......');
eta = .5*ones(n,1); % Initial step sizes
% Initialize theta variables and edge marginals
beta = 10; % Initial value of theta variables for parts with correct labels
% Allocate the per-sentence containers up front (1-by-n, the shape the loop
% below would otherwise grow them to). This avoids resizing on every iteration
% and guarantees that re-running this cell cannot pick up stale entries left
% over from an earlier run with a different number of sentences.
thetaNode = cell(1,n);
thetaEdge = cell(1,n);
nodeBels = cell(1,n);
edgeBels = cell(1,n);
for i = 1:n
    % Sentence i covers entries data.sentences(i,1):data.sentences(i,2) of data.y
    nNodes = data.sentences(i,2)-data.sentences(i,1)+1;
    thetaNode{i} = zeros(nNodes,data.nStates);
    thetaEdge{i} = zeros(data.nStates,data.nStates,nNodes-1);
    nodeBels{i} = zeros(nNodes,data.nStates);
    edgeBels{i} = zeros(data.nStates,data.nStates,nNodes-1);
    y_s = data.y(data.sentences(i,1):data.sentences(i,2));
    % Put weight beta on the observed label of every node ...
    for node = 1:nNodes
        thetaNode{i}(node,y_s(node)) = beta;
    end
    % ... and on the observed label pair of every pair of consecutive nodes
    for node = 1:nNodes-1
        thetaEdge{i}(y_s(node),y_s(node+1),node) = beta;
    end
end
% Get initial value of w
[w,dualObj,bta] = EG_getW_C(thetaNode,thetaEdge,data,lambda,nodeBels,edgeBels);
sumbta = sum(bta);
nrm2 = sum(w.^2);
evals = 0;
% NOTE(review): with '<=' one further call is made when evals equals the
% budget n*nPasses exactly -- confirm this is intended.
while evals <= n*nPasses
    if evals == 0
        iVals = int32(randperm(n)); % On the first pass, use a random permutation
    else
        iVals = int32(ceil(rand(n,1)*n)); % Use random sampling on subsequent passes
    end
    [evals,dualObj,sumbta,nrm2] = OEG_update_C(w,thetaNode,thetaEdge,data,lambda*n,eta,nodeBels,edgeBels,bta,evals,dualObj,iVals,sumbta,nrm2); % Modifies everything in place
end
w = w/(n*lambda); % Rescale before evaluating the training/testing objectives
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)
gap = funObj(w)+dualObj; % This is the duality gap

%% SAG
fprintf('Training with SAG......');
eta = 1e-2; % Tunable step size
% State handed to the solver (each array is a separate, freshly allocated buffer)
w = zeros(p,1);
d = zeros(p,1); % Initial sum of gradient approximations
covered = zeros(n,1,'int32'); % Examples we have already visited
NB = zeros(data.nWords,data.nStates); % Old values of node marginals
EB = zeros(data.nStates,data.nStates,n); % Old values of sum of edges marginals
iVals = int32(ceil(rand(n*nPasses,1)*n)); % Sequence of examples to look at
crfChain_SAGC(w,data,lambda,eta,iVals,d,covered,NB,EB); % Modifies everything in place
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% SAG (with line-search)
fprintf('Training with SAG-LS...');
L = 1; % Initial guess of Lipschitz constant
% State handed to the solver (each array is a separate, freshly allocated buffer)
w = zeros(p,1);
d = zeros(p,1); % Initial sum of gradient approximations
covered = zeros(n,1,'int32'); % Examples we have already visited
NB = zeros(data.nWords,data.nStates); % Old values of node marginals
EB = zeros(data.nStates,data.nStates,n); % Old values of sum of edges marginals
evalBudget = n*nPasses; % Keep calling the solver while evals has not exceeded this
evals = int32(0);
while evals <= evalBudget
    iVals = int32(ceil(rand(n,1)*n)); % n randomly drawn example indices for this call
    % Modifies everything except 'L' in place, returns number of forward-backward calls
    [subEvals,L] = crfChain_SAGC_LS(w,data,lambda,iVals,d,covered,NB,EB,L);
    evals = evals + subEvals;
end
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% SAG (with old non-uniform sampling strategy)
fprintf('Training with SAG-NUS..');
Lmax = 1; % Initial guess of maximum Lipschitz constant
Li = ones(n,1); % Initial guess of each individual Lipschitz constant
% State handed to the solver (each array is a separate, freshly allocated buffer)
w = zeros(p,1);
d = zeros(p,1); % Initial sum of gradient approximations
covered = zeros(n,1,'int32'); % Examples we have already visited
NB = zeros(data.nWords,data.nStates); % Old values of node marginals
EB = zeros(data.nStates,data.nStates,n); % Old values of sum of edges marginals
evalBudget = n*nPasses; % Keep calling the solver while evals has not exceeded this
evals = int32(0);
while evals <= evalBudget
    randVals = rand(n,2); % We need two random values on each iteration
    % Modifies everything except Lmax in place, returns number of forward-backward calls
    [subEvals,Lmax] = crfChain_SAGC_Lipschitz(w,data,lambda,randVals,d,covered,NB,EB,Lmax,Li);
    evals = evals + subEvals;
end
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% SAG (with new non-uniform sampling but without skipping)
fprintf('Training with SAG-NUSn.');
Lmax = 1; % Initial guess of maximum Lipschitz constant
Li = ones(n,1); % Initial guess of each individual Lipschitz constant
% State handed to the solver (each array is a separate, freshly allocated buffer)
w = zeros(p,1);
d = zeros(p,1); % Initial sum of gradient approximations
covered = zeros(n,1,'int32'); % Examples we have already visited
NB = zeros(data.nWords,data.nStates); % Old values of node marginals
EB = zeros(data.nStates,data.nStates,n); % Old values of sum of edges marginals
evalBudget = n*nPasses; % Keep calling the solver while evals has not exceeded this
evals = int32(0);
while evals <= evalBudget
    randVals = rand(n,2); % We need two random values on each iteration
    % Only the count and Lmax are returned; w is presumably updated in place,
    % since it is what gets evaluated after the loop
    [subEvals,Lmax] = crfChain_SAGC_LipschitzSimple(w,data,lambda,randVals,d,covered,NB,EB,Lmax,Li);
    evals = evals + subEvals;
end
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)

%% SAG (with new non-uniform sampling and skipping line-searches)
fprintf('Training with SAG-NUS*.');
Lmax = 1; % Initial guess of maximum Lipschitz constant
Li = ones(n,1); % Initial guess of each individual Lipschitz constant
% State handed to the solver (each array is a separate, freshly allocated buffer)
w = zeros(p,1);
d = zeros(p,1); % Initial sum of gradient approximations
covered = zeros(n,1,'int32'); % Examples we have already visited
passes = zeros(n,1,'int32'); % Number of consecutive times line-search was satisfied
skip = zeros(n,1,'int32'); % Number of times to skip line-search for this example
NB = zeros(data.nWords,data.nStates); % Old values of node marginals
EB = zeros(data.nStates,data.nStates,n); % Old values of sum of edges marginals
evalBudget = n*nPasses; % Keep calling the solver while evals has not exceeded this
evals = int32(0);
while evals <= evalBudget
    randVals = rand(n,2); % We need two random values on each iteration
    % Only the count and Lmax are returned; w is presumably updated in place,
    % since it is what gets evaluated after the loop
    [subEvals,Lmax] = crfChain_SAGC_LipschitzSimpleSkip(w,data,lambda,randVals,d,covered,NB,EB,Lmax,Li,passes,skip);
    evals = evals + subEvals;
end
SAG4CRF_printTrainTest(w,n,funObj,funObjTest)