In [None]:
import torch
import numpy as np

# Show which PyTorch version this notebook is running against.
print(torch.__version__)

A [torch.Tensor](https://pytorch.org/docs/stable/tensors.html) is a multi-dimensional matrix containing elements of a single data type. Tensors are similar to NumPy's ndarray, with added functionality to take advantage of GPUs and automatic differentiation.

In [None]:
# 1-D tensor holding the integers 0 through 5.
a = torch.arange(6)
a

In [None]:
# Dimensions of the tensor.
a.shape

In [None]:
# Type of the tensor, returned as a string.
a.type()

In [None]:
# Data type of the tensor's elements.
a.dtype

In [None]:
# Method form of a.shape.
a.size()

In [None]:
# Rebind `a` to the same six values laid out as 2 rows x 3 columns.
a = torch.arange(6).reshape(2,3)
a

In [None]:
# Collapse the 2x3 tensor into a single dimension.
a.flatten()

In [None]:
# NumPy counterpart: a 3x4 array of 0..11, printed and then shown in
# flattened 1-D form.
b_np = np.arange(12).reshape(3,4)
print(b_np)
b_np.ravel()

In [None]:
# Wrap the NumPy array as a tensor. Per the PyTorch docs, from_numpy shares
# memory with b_np instead of copying it.
b_pt = torch.from_numpy(b_np)
b_pt

In [None]:
# Convert the tensor back to a NumPy array.
b_pt.numpy()

In [None]:
# The scalar is added to every element; the result is a new tensor and
# b_pt is not reassigned.
b_pt + 1

In [None]:
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated element-wise.

    Args:
        x: A torch.Tensor, a NumPy array, or a Python/NumPy scalar.

    Returns:
        A torch.Tensor when ``x`` is a tensor (computed with torch ops, so it
        stays on the tensor's device and remains differentiable); otherwise
        whatever the NumPy expression yields.
    """
    if isinstance(x, torch.Tensor):
        # np.exp cannot be applied to tensors that require grad (or that live
        # on the GPU), so use torch.exp for tensor input.
        if not (x.is_floating_point() or x.is_complex()):
            # Integer input: promote to float64, the same promotion NumPy
            # applies to int64 arrays in np.exp.
            x = x.to(torch.float64)
        return 1 / (1 + torch.exp(-x))
    return 1 / (1 + np.exp(-x))
# Apply the sigmoid to every element of the tensor built from b_np.
sigmoid(b_pt)

In [None]:
# Reshape the 12 elements into 3 columns; -1 lets PyTorch infer the number
# of rows (4).
b = b_pt.reshape(-1, 3)
b

In [None]:
# Transpose of the 2-D tensor b.
b.t()

In [None]:
# Move b to the GPU when CUDA is available; otherwise b is left where it is.
# The availability flag and the (possibly moved) tensor are both printed.
has_cuda = torch.cuda.is_available()
if has_cuda:
    b = b.cuda()
print(has_cuda)
print(b)

# Example of computation graph 1

In [None]:
# Leaf tensor of twos (2x5) that autograd should track.
# torch.tensor(<existing tensor>) copy-constructs and makes PyTorch emit a
# UserWarning; flagging the freshly built tensor with requires_grad_() gives
# the same leaf without the extra copy or the warning.
x = (2 * torch.ones(10).view(2,5)).requires_grad_(True)
print(x)
x.is_leaf

In [None]:
# y is computed from x, so it is an intermediate node of the graph rather
# than a leaf (is_leaf is expected to be False here).
y = x + 1
print(y)
y.is_leaf

In [None]:
# y was derived from x, which requires grad.
y.requires_grad

In [None]:
# z depends on x both directly and through y.
z = 2 * y * y + x
z

In [None]:
# z was derived from x and y, which require grad.
z.requires_grad

In [None]:
# Reduce z to a single scalar value.
out = z.mean()
out

In [None]:
# out was derived from z, which requires grad.
out.requires_grad

In [None]:
# Backpropagate from the scalar `out`, then inspect the stored gradients.
out.backward()
print(x.grad)
# y is a non-leaf tensor and retain_grad() was not called on it, so autograd
# is not expected to keep its gradient: this should print None (PyTorch also
# emits a UserWarning about reading .grad of a non-leaf tensor).
print(y.grad)

## Use *retain_grad* to store gradients with non-leaf nodes

In [None]:
# Leaf tensor: a 2x5 block of twos tracked by autograd.
x = (2 * torch.ones(10).view(2, 5)).requires_grad_(True)

# y is an intermediate node; retain_grad() asks autograd to keep its
# gradient after backward().
y = x + 1
y.retain_grad()

# Forward pass down to a scalar, then backpropagate.
z = 2 * y * y + x
out = z.mean()
out.backward()

x.grad

In [None]:
# y was derived from x, which requires grad.
y.requires_grad

In [None]:
# Populated because y.retain_grad() was called before backward().
y.grad

# Example of computation graph 2

In [None]:
# A one-element leaf tensor tracked by autograd, and a value derived from it.
x = torch.tensor([1.0], requires_grad=True)
g1 = x * 3
g1.requires_grad

In [None]:
# Build the graph x -> (g1, g2) -> (h1, h2) -> loss from a single leaf x.
x = torch.tensor([1.0], requires_grad=True)

# First layer of intermediate values.
g1 = 3 * x
g2 = x.pow(2) + x

# Second layer; both branches reuse g1.
h1 = 2 * g1 + 1
h2 = 2 * g1 + g2

# Squared difference of the two branches.
loss = (h1 - h2).pow(2)
print(loss)

In [None]:
# Backpropagate through the graph built in the previous cell, then show the
# gradient stored on x, x's underlying data, and the function that produced
# loss.
loss.backward()
x.grad, x.data, loss.grad_fn

In [None]:
# Calling backward() a second time on the same graph is expected to fail:
# the first call was made without retain_graph=True, so the buffers needed
# for the backward pass have been freed. The error is caught and printed so
# that the notebook still runs top to bottom with "Run All".
try:
    loss.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

## Set *retain_graph* to be able to reuse the graph after the first call to *backward*

In [None]:
# Rebuild the graph: loss = (h1 - h2)^2, derived from the single leaf x.
x = torch.tensor([1.0], requires_grad=True)
g1 = 3 * x
g2 = x.pow(2) + x
h1 = 2 * g1 + 1
h2 = 2 * g1 + g2
loss = (h1 - h2).pow(2)

# retain_graph=True keeps the graph usable for further backward passes.
# Run two passes and print x.grad after each one.
for _pass in range(2):
    loss.backward(retain_graph=True)
    print(x.grad)

In [None]:
# Reset gradients before calling backward(): zero the stored gradient in
# place so the value printed below comes from this backward pass only.
x.grad.zero_()
loss.backward(retain_graph = True)
print(x.grad)

## Use *x.data* to get the underlying tensor of parameter *x*, useful for changing values in-place before calling *backward* on the same computation graph.
### See also this [PyTorch migration guide](https://pytorch.org/blog/pytorch-0_4_0-migration-guide/).

In [None]:
# Build the graph at x = 1 and backpropagate once.
x = torch.tensor([1.0], requires_grad = True)
g1 = 3 * x
g2 = x ** 2 + x
h1 = 2 * g1 + 1
h2 = 2 * g1 + g2
loss = (h1 - h2) ** 2

loss.backward(retain_graph = True)
print(x.grad)  # gradient of loss at x = 1

# x[0] = 0.0 will lead to a RuntimeError, not allowed to change the value of a variable needed for gradient computation.
# Use x.data instead, shares the same tensor, but no computation history.
# NOTE(review): `with torch.no_grad(): x[0] = 0.0` appears to be the currently
# recommended way to do this; x.data is kept here because it is the subject
# of this section.
x.data[0] = 0.0

# Recompute the forward pass with the new value of x.
g1 = 3 * x
g2 = x ** 2 + x
h1 = 2 * g1 + 1
h2 = 2 * g1 + g2
loss = (h1 - h2) ** 2

# Clear the gradient left over from x = 1; otherwise backward() would add the
# new gradient to it and the printed value would be the sum of both.
x.grad.zero_()
loss.backward(retain_graph = True)
print(x.grad)  # gradient of loss at x = 0

# General *autograd* with vector-Jacobian products
### vector $x \rightarrow$ vector $z(x) \rightarrow$ scalar $L(z)$
### $\displaystyle\frac{\partial L}{\partial x} =$ vector $\times$ Jacobian $= \displaystyle\frac{\partial L}{\partial z} \times \frac{\partial z}{\partial x}$

In [None]:
# Leaf tensor of twos (2x5), tracked by autograd.
x = (2 * torch.ones(10).view(2, 5)).requires_grad_(True)

# Forward pass: x -> y -> z (same shape as x) -> scalar out.
y = x + 1
z = 2 * y * y + x
out = z.mean()

# Keep the graph so that later cells can backpropagate through z again.
out.backward(retain_graph=True)

# Show the gradient together with its tensor type and dtype.
grad_x = x.grad
print(grad_x)
print(grad_x.type())
print(grad_x.dtype)

In [None]:
# Now compute gradient of z (each element in z) with respect to x.
# Not working because z is not a scalar, requires argument that is
# gradient with respect to z (same shape as z).
# The expected RuntimeError is caught and printed so that the notebook still
# runs top to bottom with "Run All".
x.grad.zero_()
try:
    z.backward(retain_graph = True)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

In [None]:
# Now compute gradient of z (each element in z) with respect to x.
# Assume gradient wrt z is 1 (a tensor of ones with the same shape as z).
# Zero x.grad first so the result does not depend on which cells ran before
# and stays the same when this cell is re-run.
x.grad.zero_()
z.backward(torch.ones(10).view(2,-1), retain_graph = True)
x.grad

In [None]:
# Now compute gradient of z (each element in z) with respect to x.
# Assume gradient wrt z is 2 (a tensor of twos with the same shape as z).
# x.grad is zeroed first so the result is not added to an earlier gradient.
x.grad.zero_()
z.backward(2 * torch.ones(10).view(2,-1), retain_graph = True)
x.grad