In [1]:
import torch
dtype = torch.FloatTensor

## Create an artificial dataset.

In [2]:
# Create random input data X: 100 examples, 10 features.
X = 10 * torch.rand(10, 100).type(dtype) + 7

# Create random true model, with parameters w_true and b_true.
w_true = torch.randn(1, 10).type(dtype)
b_true = 3

print("True (unknown) w:", w_true)
print("True (unknown) b:", b_true)

# Compute true labels using true model.
y = w_true.mm(X) + torch.ones(1, 100) * b_true
# Add some small random noise to labels.
y = y + 0.01 * torch.rand(100)
print(X[:,3])
print(y[:,3])

True (unknown) w: tensor([[-0.3182,  0.4796,  1.0055, -1.1044,  0.7169,  0.4626, -0.2007,  0.6683,
         -2.1065, -0.6109]])
True (unknown) b: 3
tensor([ 9.6119, 14.7852,  7.6711, 16.5726,  9.7087, 10.8515, 15.9129, 14.3722,
        12.3718,  8.2367])
tensor([-16.2539])


## Linear regression with gradient descent.

Why do we need so many epochs?

In [7]:
# Create Tensors for weights, initialize with random values.
# Setting requires_grad=True indicates that we want to compute gradients 
# with respect to them during the backward pass.
w = torch.randn(1, 10, dtype = torch.float32, requires_grad = True)
b = torch.zeros(1, 1, dtype = torch.float32, requires_grad=True)

In [8]:
learning_rate1 = 0.0001 # try 0.1
learning_rate2 = 0.1
for epoch in range(30001): # try 100 first
    # Forward pass.
    y_pred = w.mm(X) + b

    # Compute loss.
    loss = (y_pred - y).pow(2).sum() / 100
    if epoch % 1000 == 0:
        print('Epoch', epoch, 'loss = ', loss.data.item())

    # Backward pass.
    loss.backward()

    # Update weights using gradient descent; w1.data and w2.data are Tensors,
    # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
    # Tensors.
    w.data -= learning_rate1 * w.grad
    b.data -= learning_rate2 * b.grad

    # Manually zero the gradients after updating weights
    w.grad.zero_()
    b.grad.zero_()

Epoch 0 loss =  277.6328430175781
Epoch 1000 loss =  2.6932830810546875
Epoch 2000 loss =  0.2348422110080719
Epoch 3000 loss =  0.03983265906572342
Epoch 4000 loss =  0.009964045137166977
Epoch 5000 loss =  0.002917892299592495
Epoch 6000 loss =  0.000901523744687438
Epoch 7000 loss =  0.0002862299152184278
Epoch 8000 loss =  9.472979581914842e-05
Epoch 9000 loss =  3.4689390304265544e-05
Epoch 10000 loss =  1.5835161320865154e-05
Epoch 11000 loss =  9.904512808134314e-06
Epoch 12000 loss =  8.036930921662133e-06
Epoch 13000 loss =  7.423370334436186e-06
Epoch 14000 loss =  7.234950771817239e-06
Epoch 15000 loss =  7.179718977567973e-06
Epoch 16000 loss =  7.165413080656435e-06
Epoch 17000 loss =  7.166114301071502e-06
Epoch 18000 loss =  7.166114301071502e-06
Epoch 19000 loss =  7.166114301071502e-06
Epoch 20000 loss =  7.166114301071502e-06
Epoch 21000 loss =  7.166114301071502e-06
Epoch 22000 loss =  7.166114301071502e-06
Epoch 23000 loss =  7.166114301071502e-06
Epoch 24000 loss =

In [9]:
print(w_true)
print(b_true)

tensor([[-0.3182,  0.4796,  1.0055, -1.1044,  0.7169,  0.4626, -0.2007,  0.6683,
         -2.1065, -0.6109]])
3


In [10]:
print(w.data)
print(b.data)

tensor([[-0.3182,  0.4795,  1.0056, -1.1045,  0.7169,  0.4627, -0.2007,  0.6683,
         -2.1062, -0.6111]])
tensor([[3.0033]])


## Linear regression with Adam (GD with adaptive learning rate).

In [11]:
# Create Tensors for weights, initialize with random values.
# Setting requires_grad=True indicates that we want to compute gradients 
# with respect to them during the backward pass.
w = torch.randn(1, 10, dtype = torch.float32, requires_grad = True)
b = torch.zeros(1, 1, dtype = torch.float32, requires_grad=True)

In [12]:
optimizer = torch.optim.Adam([b, w], lr = 0.001)

for epoch in range(30001):  # try 100 first
    # Forward pass.
    y_pred = w.mm(X) + b

    # Compute loss.
    loss = (y_pred - y).pow(2).sum() / 100
    if epoch % 1000 == 0:
        print('Epoch', epoch, 'loss = ', loss.data.item())

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

Epoch 0 loss =  9116.30859375
Epoch 1000 loss =  371.3829650878906
Epoch 2000 loss =  158.77098083496094
Epoch 3000 loss =  145.34825134277344
Epoch 4000 loss =  125.56807708740234
Epoch 5000 loss =  99.16988372802734
Epoch 6000 loss =  68.58865356445312
Epoch 7000 loss =  39.447242736816406
Epoch 8000 loss =  17.807918548583984
Epoch 9000 loss =  5.819873809814453
Epoch 10000 loss =  1.203519344329834
Epoch 11000 loss =  0.17176960408687592
Epoch 12000 loss =  0.06929294764995575
Epoch 13000 loss =  0.061091646552085876
Epoch 14000 loss =  0.05306772142648697
Epoch 15000 loss =  0.04247580096125603
Epoch 16000 loss =  0.030242079868912697
Epoch 17000 loss =  0.018403910100460052
Epoch 18000 loss =  0.00908545684069395
Epoch 19000 loss =  0.0033217165619134903
Epoch 20000 loss =  0.0007540110382251441
Epoch 21000 loss =  0.00010015777661465108
Epoch 22000 loss =  1.4116122656560037e-05
Epoch 23000 loss =  7.461414043063996e-06
Epoch 24000 loss =  7.118911071302136e-06
Epoch 25000 loss 

In [13]:
print(w_true)
print(b_true)

tensor([[-0.3182,  0.4796,  1.0055, -1.1044,  0.7169,  0.4626, -0.2007,  0.6683,
         -2.1065, -0.6109]])
3


In [14]:
print(w.data)
print(b.data)

tensor([[-0.3182,  0.4796,  1.0056, -1.1044,  0.7169,  0.4627, -0.2007,  0.6684,
         -2.1063, -0.6111]])
tensor([[3.0010]])


## Linear regression with [torch.nn](https://pytorch.org/docs/stable/nn.html) functions.

In [15]:
# Put examples in rows.
X = X.transpose(0, 1)
y = y.transpose(0, 1)

In [16]:
# Create Linear model, see https://pytorch.org/docs/stable/nn.html#linear.
model = torch.nn.Linear(X.shape[1], 1)

loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

for epoch in range(30001):  # try 100 first
    # Forward pass.
    y_pred = model(X)

    # Compute loss.
    loss = loss_fn(y_pred, y)
    if epoch % 1000 == 0:
        print(epoch, loss.data)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

0 tensor(94.8511)
1000 tensor(24.3929)
2000 tensor(5.6260)
3000 tensor(0.9668)
4000 tensor(0.1035)
5000 tensor(0.0326)
6000 tensor(0.0210)
7000 tensor(0.0115)
8000 tensor(0.0048)
9000 tensor(0.0013)
10000 tensor(0.0002)
11000 tensor(1.8128e-05)
12000 tensor(7.5217e-06)
13000 tensor(7.4849e-06)
14000 tensor(7.0811e-06)
15000 tensor(7.0828e-06)
16000 tensor(7.1539e-06)
17000 tensor(7.0813e-06)
18000 tensor(7.1788e-06)
19000 tensor(7.0883e-06)
20000 tensor(7.0814e-06)
21000 tensor(7.2228e-06)
22000 tensor(7.0830e-06)
23000 tensor(7.0804e-06)
24000 tensor(7.0828e-06)
25000 tensor(8.3590e-06)
26000 tensor(7.5717e-06)
27000 tensor(7.0889e-06)
28000 tensor(7.1578e-06)
29000 tensor(7.0899e-06)
30000 tensor(7.4277e-06)


In [17]:
print(w_true)
print(b_true)

tensor([[-0.3182,  0.4796,  1.0055, -1.1044,  0.7169,  0.4626, -0.2007,  0.6683,
         -2.1065, -0.6109]])
3


In [18]:
print(w.data)
print(b.data)

tensor([[-0.3182,  0.4796,  1.0056, -1.1044,  0.7169,  0.4627, -0.2007,  0.6684,
         -2.1063, -0.6111]])
tensor([[3.0010]])


# Experiment with linear regression and multi-layer NNs for the houses dataset.