In [None]:
import torch
import numpy as np

# Show the installed PyTorch version so the recorded outputs can be matched to it.
print(torch.__version__)

2.5.1+cu121


# Automatic Differentiation

## Univariate function

In [None]:
# One-element leaf tensor; everything below is a function of x.
x = torch.tensor([1.0], requires_grad=True)

# Two intermediate quantities ...
g1 = x * 3
g2 = x + x ** 2
# ... and two heads that share the 2 * g1 term.
h1 = 1 + 2 * g1
h2 = g2 + 2 * g1
loss = (h1 - h2) ** 2

# Back-propagate the scalar loss; d(loss)/dx ends up in x.grad.
loss.backward()
print(x.grad)

tensor([6.])


In [None]:
# 0-dim leaf tensor and the quadratic f(x) = 2x^2 - 3x + 4 built from it.
x = torch.tensor(1.0, requires_grad=True)
f = (2 * x * x - 3 * x) + 4
print(x)
print(f)

tensor(1., requires_grad=True)
tensor(3., grad_fn=<AddBackward0>)


In [None]:
# Back-propagate from the scalar f; autograd stores df/dx in x.grad.
f.backward() # compute the gradient of f

In [None]:
# df/dx = 4x - 3 evaluated at x = 1, filled in by the backward() call above.
x.grad # with respect to x

tensor(1.)

## Multivariate function

In [None]:
# Leaf vector w; gradient tracking is switched on in place after creation.
w = torch.tensor([1.0, 2.0]).requires_grad_()
print(w)

# Two intermediate scalars built from the components of w.
e = (w[0] + w[1]) ** 2
r = w[0] * 2 - 1
# r is an intermediate (non-leaf) tensor: ask autograd to keep its gradient.
r.retain_grad()
loss = e + r
print(e.requires_grad, r.requires_grad, loss.requires_grad)
print(loss)

# d(loss)/dw goes to w.grad, d(loss)/dr to r.grad (thanks to retain_grad).
loss.backward()
print(w.grad)
print(r.grad)

tensor([1., 2.], requires_grad=True)
True True True
tensor(10., grad_fn=<AddBackward0>)
tensor([8., 6.])
tensor(1.)


# Basics of PyTorch tensors

A [torch.Tensor](https://pytorch.org/docs/stable/tensors.html) is a multi-dimensional matrix containing elements of *a single data type*. Tensors are similar to NumPy's ndarrays, with added functionality to take advantage of GPUs and automatic differentiation.

In [None]:
# Integer range shifted by -2; with integer arguments arange yields int64.
a = -2 + torch.arange(6)
print(a)
# Python-level class vs. the legacy tensor type string.
print(type(a), a.type())
print(a.dtype)

tensor([-2, -1,  0,  1,  2,  3])
<class 'torch.Tensor'> torch.LongTensor
torch.int64


In [None]:
# Same construction, but the element type is requested explicitly via dtype.
a = -2 + torch.arange(4, dtype=torch.float32)
print(a.dtype)
a

torch.float32


tensor([-2., -1.,  0.,  1.])

In [None]:
# A float argument makes arange produce a floating-point tensor without an explicit dtype.
torch.arange(4.0) - 2

tensor([-2., -1.,  0.,  1.])

In [None]:
# Shape of the 1-D tensor `a`: a torch.Size with a single entry.
a.shape

torch.Size([4])

In [None]:
# a.type() reports the tensor type as a string; the shape object itself is a torch.Size.
a.type(), type(a.shape)

('torch.FloatTensor', torch.Size)

PyTorch has 12 different data types. Most functions that create tensors have a keyword argument `dtype` that can be used to specify a non-default type for the tensor.

https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.dtype

In [None]:
# Element data type of `a` (it was created with dtype = torch.float32 above).
a.dtype

In [None]:
# Dimensions of `a`, queried through the size() method instead of the .shape attribute.
a.size()

In [None]:
# 2x3 integer matrix holding 1..6: reshape the range first, then shift by one.
a = 1 + torch.arange(6).reshape(2, 3)
a

tensor([[1, 2, 3],
        [4, 5, 6]])

In [None]:
# Flatten the 2x3 tensor to 1-D; `a` is printed as well to show it is unchanged so far.
flat = a.flatten()
print(flat)
print(a)

tensor([1, 2, 3, 4, 5, 6])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [None]:
# Write through `flat`: the output shows that `a` changes as well, i.e.
# flatten() returned a view sharing memory with `a` here, not a copy.
flat[0] = -1
print(flat)
print(a)

tensor([-1,  2,  3,  4,  5,  6])
tensor([[-1,  2,  3],
        [ 4,  5,  6]])


In [None]:
# 3x4 NumPy array holding 0..11, followed by its 1-D form via ravel().
b_np = np.arange(12).reshape((3, 4))
print(b_np)
b_np.ravel()

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [None]:
# flatten() yields the same 1-D values as ravel() in the previous cell.
# NOTE(review): per the NumPy docs flatten() always copies while ravel() returns
# a view when it can — worth contrasting with torch's flatten() above; confirm.
b_np.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

Use `from_numpy()` to create a Tensor from a numpy.ndarray that will share the same memory. The more general `as_tensor()` can create Tensors from any array_like data structure.

https://pytorch.org/docs/stable/generated/torch.from_numpy.html

https://pytorch.org/docs/stable/generated/torch.as_tensor.html#torch.as_tensor

Going from Tensor to numpy.ndarray can be done using `numpy()`, with the same conditions for sharing memory.

**It is important to know when a copy is performed vs. data is shared.**

In [None]:
# Wrap the NumPy array as a tensor; from_numpy() shares memory with b_np
# instead of copying (demonstrated in the next cell).
b_pt = torch.from_numpy(b_np)
b_pt

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [None]:
# Modify the tensor in place: b_np shows the same change (12 in the last
# position) because both objects use the same memory.
b_pt[-1, -1] = 12
b_pt, b_np

(tensor([[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 12]]),
 array([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 12]]))

In [None]:
# Convert back to a NumPy array with numpy(); this array also shares memory
# with the tensor (see the next cell).
bnp = b_pt.numpy()
bnp

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 12]])

In [None]:
# Modify the array this time: the tensor b_pt reflects the change, which
# restores the original value 11.
bnp[-1, -1] = 11
bnp, b_pt

(array([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]]),
 tensor([[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]]))

PyTorch is Python, so you get the flexibility of dynamic typing, generic functions, automatic type casting, ...

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with NumPy's exp."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# np.exp accepts the torch tensor directly; as the output shows, the result
# comes back as a float64 torch.Tensor rather than a NumPy array.
print(sigmoid(b_pt))
# b_pt itself is still a torch.Tensor.
type(b_pt)

tensor([[0.5000, 0.7311, 0.8808, 0.9526],
        [0.9820, 0.9933, 0.9975, 0.9991],
        [0.9997, 0.9999, 1.0000, 1.0000]], dtype=torch.float64)


torch.Tensor

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with torch.exp on a tensor."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator
# Cast the int64 tensor to float32 before applying the torch-based sigmoid.
# The dtype-based .float() replaces the legacy torch.FloatTensor type object;
# the resulting dtype is the same (torch.float32).
bexp = sigmoid(b_pt.float())
bexp, bexp.dtype

(tensor([[0.5000, 0.7311, 0.8808, 0.9526],
         [0.9820, 0.9933, 0.9975, 0.9991],
         [0.9997, 0.9999, 1.0000, 1.0000]]),
 torch.float32)

When possible, the reshaped tensor will be a *view* of the input tensor; otherwise, it will be a copy.

https://pytorch.org/docs/stable/generated/torch.reshape.html

https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view

In [None]:
# Reshape the 3x4 tensor to three columns; -1 lets PyTorch infer the row count.
b = b_pt.reshape(-1, 3)
b

In [None]:
# Write through b and display it next to b_pt: if the change also appears in
# b_pt, reshape() returned a view rather than a copy.
# NOTE(review): b_pt was created with from_numpy(b_np), so if b is a view this
# write presumably reaches b_np as well — confirm.
b[0,0] = -1.0
b, b_pt

In [None]:
# Transpose of the 2-D tensor b.
bt = b.t()
bt

In [None]:
# Flatten the transposed tensor. reshape() returns a view when possible and a
# copy otherwise; the next cell checks which one happened here.
bt_flat = bt.reshape(-1)
bt_flat

In [None]:
# Modify bt_flat and display it next to bt: if bt is unchanged, reshape(-1)
# had to make a copy of the transposed tensor.
bt_flat[0] = 0
bt_flat, bt

Tensors that are stored on the GPU have a type that starts with `torch.cuda`. For example, tensors of type `torch.FloatTensor` are stored on the CPU, whereas `torch.cuda.FloatTensor` are stored on the GPU.

In [None]:
# Copy b to the GPU only when CUDA is present. Everything that touches b_gpu
# stays inside the guard so the cell also runs on CPU-only machines instead of
# failing with a NameError.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    b_gpu = b.cuda()
    print(b_gpu, b_gpu.type())
else:
    print("CUDA is not available; b stays on the CPU as", b.type())

False


NameError: name 'b_gpu' is not defined

# Example of computation graph 1

In [None]:
# Scalar leaf x and f(x) = x^2 + 3x - 1; f inherits requires_grad from x.
x = torch.tensor(1.0, requires_grad=True)
f = x * x + x * 3 - 1

print(f.requires_grad)
print(f)

In [None]:
# Back-propagate f (df/dx = 2x + 3 = 5 at x = 1), then inspect the leaf x:
# its gradient, its underlying data and its grad_fn.
f.backward()
x.grad, x.data, x.grad_fn

In [None]:
# The same three attributes for f, which is the result of operations on x
# rather than a tensor created directly by the user.
f.grad, f.data, f.grad_fn

In [None]:
# Compare the is_leaf flags of the computed tensor f and the user-created x.
f.is_leaf, x.is_leaf

In [None]:
# `loss` still refers to the graph built in the multivariate example; its
# backward() was already called once without retain_graph, so calling it again
# raises a RuntimeError. The error is caught and shown so that "Run All" can
# continue past this cell.
# NOTE(review): if a second f.backward() was intended here instead, it fails
# for the same reason — confirm which one the demonstration should use.
try:
    loss.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

# Example of computation graph 2

In [None]:
# 2x5 tensor filled with 0.5. The scaled ones() result is created first and
# gradient tracking is enabled in place afterwards.
# Exercise: try x = 0.5 * torch.ones(2, 5, requires_grad = True) instead and
# compare what x.is_leaf reports.
x = (0.5 * torch.ones(2, 5)).requires_grad_()

print(x)
print(x.is_leaf, x.requires_grad)

In [None]:
# y = 2x + 1 is computed from x, so it is an intermediate node of the graph.
y = 1 + 2 * x
print(y)
y.is_leaf

In [None]:
# y was computed from x, which requires grad; check whether y does too.
y.requires_grad

In [None]:
# z = y^2 - 2x depends on x both directly and through y.
z = y * y - x * 2
z

In [None]:
# z is built from y and x; check whether it requires grad as well.
z.requires_grad

In [None]:
# Reduce z to a single scalar (the mean of its elements) so that backward()
# can be called without an explicit gradient argument.
out = z.mean()
out

In [None]:
# The scalar out is computed from z; check whether it requires grad.
out.requires_grad

In [None]:
# Back-propagate from the scalar mean. For the leaf x each entry of x.grad is
# (4y - 2) / 10 = 0.6 (y = 2, ten elements in the mean).
out.backward()

print(x.grad)
# y is an intermediate tensor and retain_grad() was not called on it; the next
# section shows how to keep gradients for such non-leaf nodes.
print(y.grad)

## Use *retain_grad* to store gradients with non-leaf nodes

In [None]:
# 2x5 leaf tensor of ones with gradient tracking enabled in place.
x = torch.ones(10).view(2, 5).requires_grad_(True)

# y is an intermediate node; retain_grad() asks autograd to keep d(out)/dy.
y = 1 + x
y.retain_grad()

z = 2 * y * y + x
out = z.mean()
out.backward()

x.grad

In [None]:
# y = x + 1 is computed from x, which requires grad.
y.requires_grad

In [None]:
# Kept because retain_grad() was called on y before backward():
# d(out)/dy = 4y / 10 = 0.8 per element (y = 2).
y.grad

## Set *retain_graph* to be able to reuse the graph after first call to *backward*

In [None]:
# Same graph as in the univariate example, with a single backward() call.
x = torch.tensor([1.0], requires_grad=True)
g1 = x * 3
g2 = x + x ** 2
h1 = 1 + 2 * g1
h2 = g2 + 2 * g1
loss = (h1 - h2) ** 2

# Without retain_graph this graph cannot be back-propagated a second time.
loss.backward()
print(x.grad)

tensor([6.])


In [None]:
# Rebuild the graph from a fresh leaf tensor.
x = torch.tensor([1.0], requires_grad=True)
g1 = x * 3
g2 = x + x ** 2
h1 = 1 + 2 * g1
h2 = g2 + 2 * g1
loss = (h1 - h2) ** 2

# First pass: retain_graph keeps the graph usable for another backward().
loss.backward(retain_graph=True)
print(x.grad)
# Second pass over the same graph: the new gradient is added to x.grad.
loss.backward(retain_graph=True)
print(x.grad)

tensor([6.])
tensor([12.])


In [None]:
# Gradients accumulate across backward() calls (6 -> 12 in the previous cell),
# so reset x.grad in place before calling backward() again to obtain the plain
# derivative.
x.grad.zero_()
loss.backward(retain_graph = True)
print(x.grad)

tensor([6.])


## Use *x.data* to get the underlying tensor of parameter *x*, useful for changing values in-place before calling *backward* on the same computation graph.
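### See also this [PyTorch migration guide](https://pytorch.org/blog/pytorch-0_4_0-migration-guide/), which explains why `x.data` can be unsafe (autograd does not track changes made through it) and recommends `x.detach()` as a safer alternative.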

In [None]:
# First forward/backward pass at x = 1.
x = torch.tensor([1.0], requires_grad=True)
g1 = x * 3
g2 = x + x ** 2
h1 = 1 + 2 * g1
h2 = g2 + 2 * g1
loss = (h1 - h2) ** 2

loss.backward(retain_graph=True)
print(x.grad)

# Assigning x[0] = 0.0 directly leads to a RuntimeError: changing the value of
# a variable needed for gradient computation is not allowed.
# x.data shares the same tensor but carries no computation history, so the
# value can be overwritten through it.
x.data[0] = 0.0

# Second forward pass, now evaluated at x = 0.
g1 = x * 3
g2 = x + x ** 2
h1 = 1 + 2 * g1
h2 = g2 + 2 * g1
loss = (h1 - h2) ** 2

loss.backward(retain_graph=True)
# x.grad was not zeroed in between, so this prints the sum of both gradients
# (6 from the first pass plus -2 from the second).
print(x.grad)

# General *autograd* with vector-Jacobian products
### vector $x \rightarrow$ vector $z(x) \rightarrow$ scalar $L(z)$
### $\displaystyle\frac{\partial L}{\partial x} =$ vector $\times$ Jacobian $= \displaystyle\frac{\partial L}{\partial z} \times \frac{\partial z}{\partial x}$

In [None]:
# 2x5 leaf tensor filled with 2, with gradient tracking enabled in place.
x = (2 * torch.ones(10).view(2, 5)).requires_grad_(True)
y = 1 + x
z = 2 * y * y + x
out = z.mean()
# Keep the graph: the following cells call backward() on z itself.
out.backward(retain_graph=True)

# The gradient has the same shape as x; its type and dtype are shown too.
print(x.grad)
print(x.grad.type())
print(x.grad.dtype)

In [None]:
# Now try to compute the gradient of z (each element in z) with respect to x.
# This does not work as written: z is not a scalar, so backward() requires an
# argument holding the gradient with respect to z (same shape as z).
# The resulting RuntimeError is caught and printed so that the notebook still
# runs from top to bottom.
x.grad.zero_()
try:
    z.backward(retain_graph = True)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

In [None]:
# Back-propagate from every element of z by passing the upstream gradient
# explicitly: a 2x5 tensor of ones, i.e. the gradient with respect to z is 1.
# With y = 3 each element contributes dz/dx = 4y + 1 = 13 to x.grad.
z.backward(gradient=torch.ones(2, 5), retain_graph=True)
x.grad

In [None]:
# Same computation with an upstream gradient of 2 for every element of z.
# x.grad is reset first so the result is not added to the previous one;
# each entry then equals 2 * (4y + 1) = 26.
x.grad.zero_()
z.backward(gradient=2 * torch.ones(2, 5), retain_graph=True)
x.grad