Skip to content

unconverge #2

@TianrongChen

Description

@TianrongChen

Hi! I'm learning DDP method recently and also upvoted your brilliant implementation. It seems like you are using the MPC version of ilqr? I change it into normal version but it does not converged any more.... Could you please exaplain a little bit for me.
But anyway thanks for your implementation!!
The following is my code which is modified from your code!

import gym
import env
from autograd import grad, jacobian
import autograd.numpy as np

class ILqr:
def init(self, next_state, running_cost, final_cost,
umax, state_dim, pred_time=500):
self.pred_time = pred_time
self.umax = umax
self.v = [0.0 for _ in range(pred_time + 1)]
self.v_x = [np.zeros(state_dim) for _ in range(pred_time + 1)]
self.v_xx = [np.zeros((state_dim, state_dim)) for _ in range(pred_time + 1)]
self.f = next_state
self.lf = final_cost
self.lf_x = grad(self.lf)
self.lf_xx = jacobian(self.lf_x)
self.l_x = grad(running_cost, 0)
self.l_u = grad(running_cost, 1)
self.l_xx = jacobian(self.l_x, 0)
self.l_uu = jacobian(self.l_u, 1)
self.l_ux = jacobian(self.l_u, 0)
self.f_x = jacobian(self.f, 0)
self.f_u = jacobian(self.f, 1)

def backward(self, x_seq, u_seq):
    self.v[-1] = self.lf(x_seq[-1])
    self.v_x[-1] = self.lf_x(x_seq[-1])
    self.v_xx[-1] = self.lf_xx(x_seq[-1])
    k_seq = []
    kk_seq = []
    for t in range(self.pred_time - 1, -1, -1):
        f_x_t = self.f_x(x_seq[t], u_seq[t])
        f_u_t = self.f_u(x_seq[t], u_seq[t])
        q_x = self.l_x(x_seq[t], u_seq[t]) + np.matmul(f_x_t.T, self.v_x[t + 1])
        q_u = self.l_u(x_seq[t], u_seq[t]) + np.matmul(f_u_t.T, self.v_x[t + 1])
        q_xx = self.l_xx(x_seq[t], u_seq[t]) + \
          np.matmul(np.matmul(f_x_t.T, self.v_xx[t + 1]), f_x_t) 
        tmp = np.matmul(f_u_t.T, self.v_xx[t + 1])
        q_uu = self.l_uu(x_seq[t], u_seq[t]) + np.matmul(tmp, f_u_t) 
        q_ux = self.l_ux(x_seq[t], u_seq[t]) + np.matmul(tmp, f_x_t) 
        inv_q_uu = np.linalg.inv(q_uu)
        k = -np.matmul(inv_q_uu, q_u)
        kk = -np.matmul(inv_q_uu, q_ux)
        dv = 0.5 * np.matmul(q_u, k)
        self.v[t] += dv
        self.v_x[t] = q_x - np.matmul(np.matmul(q_u, inv_q_uu), q_ux)
        self.v_xx[t] = q_xx + np.matmul(q_ux.T, kk)
        k_seq.append(k)
        kk_seq.append(kk)
    k_seq.reverse()
    kk_seq.reverse()
    return k_seq, kk_seq

def forward(self, x_seq, u_seq, k_seq, kk_seq):
    x_seq_hat = np.array(x_seq)
    u_seq_hat = np.array(u_seq)
    for t in range(len(u_seq)):
        control = k_seq[t] + np.matmul(kk_seq[t], (x_seq_hat[t] - x_seq[t]))
        u_seq_hat[t] = np.clip(u_seq[t] + control, -self.umax, self.umax)


    return  u_seq_hat

env = gym.make('CartPoleContinuous-v0').env
obs = env.reset()
ilqr = ILqr(lambda x, u: env._state_eq(x, u),
lambda x, u: 0.5 * np.sum(np.square(u)),
lambda x: 0.5 * (np.square(1.0 - np.cos(x[2])) + np.square(x[1]) + np.square(x[3])),
env.max_force,
env.observation_space.shape[0])
u_seq = [np.zeros(1) for _ in range(ilqr.pred_time)]
x_seq = [obs.copy()]
for t in range(ilqr.pred_time):
x_seq.append(env._state_eq(x_seq[-1], u_seq[t]))

cnt = 0
while True:
env.reset()

k_seq, kk_seq = ilqr.backward(x_seq, u_seq)
u_seq = ilqr.forward(x_seq, u_seq, k_seq, kk_seq)


cost=0
x_seq = [env.reset()]
for t in range(ilqr.pred_time):
    obs, _, _, _ = env.step(u_seq[t])
    x_seq.append(obs)
    cost=cost+0.5 * np.sum(np.square(u_seq[t]))
x=x_seq[-1]
cost=cost+0.5 * (np.square(1.0 - np.cos(x[2])) + np.square(x[1]) + np.square(x[3]))
cnt += 1
print(cost)
if cost<15:
    break

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions