Introduction to numpy

NumPy is the fundamental package for scientific computing with Python. It contains among other things:

  • a powerful N-dimensional array object
  • sophisticated (broadcasting) functions
  • tools for integrating C/C++ and Fortran code
  • useful linear algebra, Fourier transform, and random number capabilities

Besides its obvious scientific uses, NumPy can also be used as an efficient multi-dimensional container of generic data. Arbitrary data-types can be defined. This allows NumPy to seamlessly and speedily integrate with a wide variety of databases.

Library documentation: http://www.numpy.org/

1. The base numpy.array object

In [1]:
import numpy as np

# declare a vector using a list as the argument
v = np.array([1., 2, 3, 4])
v
Out[1]:
array([1., 2., 3., 4.])
In [2]:
v.shape
Out[2]:
(4,)
In [3]:
v.ndim
Out[3]:
1
In [4]:
v.dtype
Out[4]:
dtype('float64')
In [5]:
w = np.array([1., 2, 3, 4], dtype='int')
w
Out[5]:
array([1, 2, 3, 4])
In [6]:
w.dtype
Out[6]:
dtype('int64')
In [7]:
a = np.arange(100, dtype=np.uint16)
In [8]:
a
Out[8]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=uint16)
In [9]:
-3 * a ** 2
Out[9]:
array([     0,     -3,    -12,    -27,    -48,    -75,   -108,   -147,
         -192,   -243,   -300,   -363,   -432,   -507,   -588,   -675,
         -768,   -867,   -972,  -1083,  -1200,  -1323,  -1452,  -1587,
        -1728,  -1875,  -2028,  -2187,  -2352,  -2523,  -2700,  -2883,
        -3072,  -3267,  -3468,  -3675,  -3888,  -4107,  -4332,  -4563,
        -4800,  -5043,  -5292,  -5547,  -5808,  -6075,  -6348,  -6627,
        -6912,  -7203,  -7500,  -7803,  -8112,  -8427,  -8748,  -9075,
        -9408,  -9747, -10092, -10443, -10800, -11163, -11532, -11907,
       -12288, -12675, -13068, -13467, -13872, -14283, -14700, -15123,
       -15552, -15987, -16428, -16875, -17328, -17787, -18252, -18723,
       -19200, -19683, -20172, -20667, -21168, -21675, -22188, -22707,
       -23232, -23763, -24300, -24843, -25392, -25947, -26508, -27075,
       -27648, -28227, -28812, -29403], dtype=int32)
In [10]:
a
Out[10]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=uint16)
In [11]:
b = a + 1
b
Out[11]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100], dtype=uint16)
In [12]:
a is b
Out[12]:
False
In [13]:
a += 1
In [14]:
a
Out[14]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100], dtype=uint16)

Warning

Beware of the dimensions: a 1D array is not the same as a 2D array with 1 column

In [15]:
a1 = np.array([1, 2, 3])
print(a1, a1.shape, a1.ndim)
[1 2 3] (3,) 1
In [16]:
a2 = np.array([[1, 2, 3]]).T
print(a2, a2.shape, a2.ndim)
[[1]
 [2]
 [3]] (3, 1) 2
In [17]:
a2.dot(a1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-4e2276e15f5f> in <module>
----> 1 a2.dot(a1)

ValueError: shapes (3,1) and (3,) not aligned: 1 (dim 1) != 3 (dim 0)
In [18]:
# Declare a 2D array using a nested list as the argument
M = np.array([[1,2], [3,4], [3.14, -9.17]])
M
Out[18]:
array([[ 1.  ,  2.  ],
       [ 3.  ,  4.  ],
       [ 3.14, -9.17]])
In [19]:
M.shape
Out[19]:
(3, 2)
In [20]:
M.size
Out[20]:
6
In [21]:
M.ravel()
Out[21]:
array([ 1.  ,  2.  ,  3.  ,  4.  ,  3.14, -9.17])
In [22]:
M.ndim
Out[22]:
2
In [23]:
# arguments: start, stop, step
x = np.arange(12).reshape(4, 3)
x
Out[23]:
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])
In [24]:
np.linspace(0, 10, 50)
Out[24]:
array([ 0.        ,  0.20408163,  0.40816327,  0.6122449 ,  0.81632653,
        1.02040816,  1.2244898 ,  1.42857143,  1.63265306,  1.83673469,
        2.04081633,  2.24489796,  2.44897959,  2.65306122,  2.85714286,
        3.06122449,  3.26530612,  3.46938776,  3.67346939,  3.87755102,
        4.08163265,  4.28571429,  4.48979592,  4.69387755,  4.89795918,
        5.10204082,  5.30612245,  5.51020408,  5.71428571,  5.91836735,
        6.12244898,  6.32653061,  6.53061224,  6.73469388,  6.93877551,
        7.14285714,  7.34693878,  7.55102041,  7.75510204,  7.95918367,
        8.16326531,  8.36734694,  8.57142857,  8.7755102 ,  8.97959184,
        9.18367347,  9.3877551 ,  9.59183673,  9.79591837, 10.        ])
In [25]:
np.logspace(0, 10, 10, base=np.e, dtype=np.int32)
Out[25]:
array([    1,     3,     9,    28,    85,   258,   785,  2386,  7250,
       22026], dtype=int32)
In [26]:
%matplotlib inline
import matplotlib.pyplot as plt

# Random standard Gaussian numbers
wn = np.random.randn(1000)
# Cumsum: Brownian motion
bm = wn.cumsum()

plt.plot(bm)
Out[26]:
[<matplotlib.lines.Line2D at 0x7fe38837f8d0>]
In [27]:
np.diag(np.arange(10))
Out[27]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 3, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 4, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 5, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 6, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 7, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 8, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 9]])
In [28]:
zozo = np.zeros((10, 10))
zozo
Out[28]:
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
In [29]:
zozo.shape
Out[29]:
(10, 10)
In [30]:
print(M)
[[ 1.    2.  ]
 [ 3.    4.  ]
 [ 3.14 -9.17]]
In [31]:
M[1, 1]
Out[31]:
4.0
In [32]:
# assign new value
M[0, 0] = 7
M[1, :] = -2
M
Out[32]:
array([[ 7.  ,  2.  ],
       [-2.  , -2.  ],
       [ 3.14, -9.17]])
In [33]:
# Warning: the next m is a **view** on M. 
# One again, no copies unless you ask for one!
m = M[0, :]
m[:] = 42
M
Out[33]:
array([[42.  , 42.  ],
       [-2.  , -2.  ],
       [ 3.14, -9.17]])

2 Slicing

In [34]:
# slicing works just like with anything else (lists, etc.)
A = np.array([1, 2, 3, 4, 5])
print(A)
print(A[::-1])
print(A[::2])
print(A[:-1])
[1 2 3 4 5]
[5 4 3 2 1]
[1 3 5]
[1 2 3 4]
In [35]:
A = np.array([[n + m * 10 for n in range(5)] for m in range(5)])
A
Out[35]:
array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24],
       [30, 31, 32, 33, 34],
       [40, 41, 42, 43, 44]])
In [36]:
print(A[1:4])
[[10 11 12 13 14]
 [20 21 22 23 24]
 [30 31 32 33 34]]
In [37]:
print(A[1:4, :])
[[10 11 12 13 14]
 [20 21 22 23 24]
 [30 31 32 33 34]]
In [38]:
A[1]
Out[38]:
array([10, 11, 12, 13, 14])
In [39]:
A[:, 1]
Out[39]:
array([ 1, 11, 21, 31, 41])
In [40]:
A[:, ::-1]
Out[40]:
array([[ 4,  3,  2,  1,  0],
       [14, 13, 12, 11, 10],
       [24, 23, 22, 21, 20],
       [34, 33, 32, 31, 30],
       [44, 43, 42, 41, 40]])
In [41]:
print(A)
[[ 0  1  2  3  4]
 [10 11 12 13 14]
 [20 21 22 23 24]
 [30 31 32 33 34]
 [40 41 42 43 44]]
In [42]:
row_indices = [1, 2, 4]
print(A[row_indices])
[[10 11 12 13 14]
 [20 21 22 23 24]
 [40 41 42 43 44]]
In [43]:
A[:, row_indices]
Out[43]:
array([[ 1,  2,  4],
       [11, 12, 14],
       [21, 22, 24],
       [31, 32, 34],
       [41, 42, 44]])

Another way is through masking with an array of bools

In [44]:
# index masking
B = np.arange(5)
row_mask = np.array([True, False, True, False, False])
print(B)
print(B[row_mask])
[0 1 2 3 4]
[0 2]

3. Copies

Don't forget that python does not make copies unless told to do so (same as with any mutable type)

If you are not careful enough, this typically leads to a lot of errors and to being fired !!

In [45]:
x = np.arange(6)
y = x
x[2] = 123
y
Out[45]:
array([  0,   1, 123,   3,   4,   5])
In [46]:
x is y
Out[46]:
True
In [47]:
# A real copy
y = x.copy()
In [48]:
# Or equivalently (but the one above is better...)
y = np.copy(x)
In [49]:
x[0] = -12
print(x, y)
[-12   1 123   3   4   5] [  0   1 123   3   4   5]
In [50]:
x is y
Out[50]:
False

To put values of x in y (copy values into an existing array) use

In [51]:
y[:] = x

Final warning

In the next line you copy the values of x into an existing array y (of same size...)

In [52]:
y[:] = x

While in the next line, you give another name y to the object with name x

(and actually, you should never, never, never write something like this)

In [53]:
y = x

4. Miscellaneous stuff

Non-numerical values

A numpy array can contain other things than numeric types

In [54]:
arr = np.array(['ou', 'la', 'la', "c'est", 'dur', 'python'])
In [55]:
arr.sum()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-55-8a56dcf4f91a> in <module>
----> 1 arr.sum()

/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial)
     34 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     35          initial=_NoValue):
---> 36     return umr_sum(a, axis, dtype, out, keepdims, initial)
     37 
     38 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: cannot perform reduce with flexible type
In [56]:
arr.dtype
Out[56]:
dtype('<U6')

A matrix is no 2D array in numpy

We've been using only array or ndarray objects for now

The is another type: the matrix type

In summary: just don't use it (IMO) and stick with arrays

In [57]:
# Matrix VS array objects in numpy
m1 = np.matrix(np.arange(3))
m2 = np.matrix(np.arange(3))
In [58]:
a1 = np.arange(3)
a2 = np.arange(3)
In [59]:
m1 * m2
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-59-8448bc335613> in <module>
----> 1 m1 * m2

/anaconda3/lib/python3.7/site-packages/numpy/matrixlib/defmatrix.py in __mul__(self, other)
    218         if isinstance(other, (N.ndarray, list, tuple)) :
    219             # This promotes 1-D vectors to row vectors
--> 220             return N.dot(self, asmatrix(other))
    221         if isscalar(other) or not hasattr(other, '__rmul__') :
    222             return N.dot(self, other)

ValueError: shapes (1,3) and (1,3) not aligned: 3 (dim 1) != 1 (dim 0)
In [60]:
a1 * a2
Out[60]:
array([0, 1, 4])

Sparse matrices

In [61]:
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix

probs = np.full(fill_value=1/4, shape=(4,))
X = np.random.multinomial(n=2, pvals=probs, size=4)
X
Out[61]:
array([[1, 0, 0, 1],
       [1, 0, 0, 1],
       [0, 1, 1, 0],
       [1, 0, 1, 0]])
In [62]:
X_coo = coo_matrix(X)
print(X_coo)
X_coo
  (0, 0)	1
  (0, 3)	1
  (1, 0)	1
  (1, 3)	1
  (2, 1)	1
  (2, 2)	1
  (3, 0)	1
  (3, 2)	1
Out[62]:
<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in COOrdinate format>
In [63]:
X_coo.nnz
Out[63]:
8
In [64]:
print(X, end='\n----\n')
print(X_coo.data, end='\n----\n')
print(X_coo.row, end='\n----\n')
print(X_coo.col, end='\n----\n')
[[1 0 0 1]
 [1 0 0 1]
 [0 1 1 0]
 [1 0 1 0]]
----
[1 1 1 1 1 1 1 1]
----
[0 0 1 1 2 2 3 3]
----
[0 3 0 3 1 2 0 2]
----

There is also

  • csr_matrix: sparse rows format
  • csc_matrix: sparse columns format

Sparse rows is often used for machine learning: sparse features vectors

But sparse column format useful as well (e.g. coordinate gradient descent)

Bored of seing too many decimals?

In [65]:
X = np.random.randn(5, 5)
X
Out[65]:
array([[-0.63036028,  1.11021343,  0.99621098,  0.21559518,  0.88376054],
       [-0.25014246, -0.12628093,  0.44240165, -0.27955855, -0.91122102],
       [ 0.31558855, -0.58863629,  0.89410682,  0.75962722, -0.78295019],
       [-1.10266165, -1.01002557, -1.4394586 ,  1.45987116, -0.04861532],
       [-0.87833319,  1.55374192, -1.21265885,  1.43595696,  0.9805238 ]])
In [66]:
# All number displayed by numpy (in the current kernel) are with 3 decimals max
np.set_printoptions(precision=3)
print(X)
np.set_printoptions(precision=8)
[[-0.63   1.11   0.996  0.216  0.884]
 [-0.25  -0.126  0.442 -0.28  -0.911]
 [ 0.316 -0.589  0.894  0.76  -0.783]
 [-1.103 -1.01  -1.439  1.46  -0.049]
 [-0.878  1.554 -1.213  1.436  0.981]]

Not limited to 2D!

numpy arrays can have any number of dimension (hence the name ndarray)

In [67]:
X = np.arange(18).reshape(3, 2, 3)
X
Out[67]:
array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]],

       [[12, 13, 14],
        [15, 16, 17]]])
In [68]:
X.shape
Out[68]:
(3, 2, 3)
In [69]:
X.ndim
Out[69]:
3

5. Aggregations and statistics

In [70]:
A = np.arange(42).reshape(7, 6)
A
Out[70]:
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35],
       [36, 37, 38, 39, 40, 41]])
In [71]:
A.sum()
Out[71]:
861
In [72]:
A[:, 3].mean()
Out[72]:
21.0
In [73]:
A.mean(axis=0)
Out[73]:
array([18., 19., 20., 21., 22., 23.])
In [74]:
A.mean(axis=1)
Out[74]:
array([ 2.5,  8.5, 14.5, 20.5, 26.5, 32.5, 38.5])
In [75]:
A[:,3].std(), A[:,3].var()
Out[75]:
(12.0, 144.0)
In [76]:
A[:,3].min(), A[:,3].max()
Out[76]:
(3, 39)
In [77]:
A.cumsum(axis=1)
Out[77]:
array([[  0,   1,   3,   6,  10,  15],
       [  6,  13,  21,  30,  40,  51],
       [ 12,  25,  39,  54,  70,  87],
       [ 18,  37,  57,  78, 100, 123],
       [ 24,  49,  75, 102, 130, 159],
       [ 30,  61,  93, 126, 160, 195],
       [ 36,  73, 111, 150, 190, 231]])
In [78]:
# sum of diagonal
A.trace()
Out[78]:
105

6. Linear Algebra

In [79]:
A = np.arange(30).reshape(6, 5)
v1 = np.arange(0, 5)
v2 = np.arange(5, 10)
In [80]:
v1
Out[80]:
array([0, 1, 2, 3, 4])
In [81]:
v2
Out[81]:
array([5, 6, 7, 8, 9])
In [82]:
v1 * v2
Out[82]:
array([ 0,  6, 14, 24, 36])

Inner products

In [83]:
# Inner product between vectors
print(v1.dot(v2))

# You can use also (but first solution is better)
print(np.dot(v1, v2))
80
80
In [84]:
A, v1
Out[84]:
(array([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]), array([0, 1, 2, 3, 4]))
In [85]:
A.shape, v1.shape
Out[85]:
((6, 5), (5,))
In [86]:
# Matrix-vector inner product
A.dot(v1)
Out[86]:
array([ 30,  80, 130, 180, 230, 280])
In [87]:
# Transpose
A.T
Out[87]:
array([[ 0,  5, 10, 15, 20, 25],
       [ 1,  6, 11, 16, 21, 26],
       [ 2,  7, 12, 17, 22, 27],
       [ 3,  8, 13, 18, 23, 28],
       [ 4,  9, 14, 19, 24, 29]])
In [88]:
print(v1)
# Inline operations (same for *=, /=, -=)
v1 += 2
[0 1 2 3 4]

Linear systems

In [89]:
A = np.array([[1,2,3], [4,5,6], [7,8,9]])
b = np.array([1,2,3])
print(A, b, sep=2 * '\n')
[[1 2 3]
 [4 5 6]
 [7 8 9]]

[1 2 3]
In [90]:
# solve a system of linear equations
x = np.linalg.solve(A, b)
x
Out[90]:
array([-0.23333333,  0.46666667,  0.1       ])
In [91]:
A.dot(x)
Out[91]:
array([1., 2., 3.])

Eigenvalues and eigenvectors

In [92]:
A = np.random.rand(3,3)
B = np.random.rand(3,3)

evals, evecs = np.linalg.eig(A)
evals
Out[92]:
array([ 1.81812997, -0.37929675,  0.19575061])
In [93]:
evecs
Out[93]:
array([[ 0.44296158,  0.59591516,  0.44101265],
       [ 0.56209248,  0.53654506, -0.89735991],
       [ 0.69845335, -0.59749855, -0.01590714]])

Singular value decomposition (SVD)

Decomposes any matrix $A \in \mathbb R^{m \times n}$ as follows: $$ A = U S V^\top $$ where

  • $U$ and $V$ are orthonormal matrices (meaning that $U^\top U = I$ and $V^\top V = I$)
  • $S$ is a diagonal matrix that contains the singular values in decreasing order
In [94]:
print(A)
U, S, V = np.linalg.svd(A)
[[0.39051812 0.08082301 0.84035274]
 [0.33170512 0.34143184 0.97802995]
 [0.89887386 0.42922605 0.90263387]]
In [95]:
U.dot(np.diag(S)).dot(V)
Out[95]:
array([[0.39051812, 0.08082301, 0.84035274],
       [0.33170512, 0.34143184, 0.97802995],
       [0.89887386, 0.42922605, 0.90263387]])
In [96]:
# U and V are indeed orthonormal
np.set_printoptions(precision=2)
print(U.T.dot(U), V.T.dot(V), sep=2 * '\n')
np.set_printoptions(precision=8)
[[ 1.00e+00  1.91e-17 -1.84e-16]
 [ 1.91e-17  1.00e+00  6.52e-17]
 [-1.84e-16  6.52e-17  1.00e+00]]

[[1.00e+00 2.83e-17 1.99e-16]
 [2.83e-17 1.00e+00 4.37e-17]
 [1.99e-16 4.37e-17 1.00e+00]]

Exercice: the racoon SVD

  • Load the racoon face picture using scipy.misc.face()
  • Visualize the picture
  • Write a function which reshapes the picture into a 2D array, and computes the best rank-r approximation of it (the prototype of the function is compute_approx(X, r)
  • Display the different approximations for r between 5 and 100
In [97]:
import numpy as np
from scipy.misc import face
import matplotlib.pyplot as plt
%matplotlib inline

X = face()
plt.imshow(X)
_ = plt.axis('off')
In [98]:
def compute_approx(X: np.ndarray, r: int):
    """Computes the best rank-r approximation of X using SVD.
    We expect X to the 3D array corresponding to a color image, that we 
    reduce to a 2D one to apply SVD (no broadcasting).
    
    Parameters
    ----------
    X : `np.ndarray`, shape=(n_rows, n_cols, 3)
        The input 3D ndarray
    
    r : `int`
        The desired rank
        
    Return
    ------
    output : `np.ndarray`, shape=(n_rows, n_cols, 3)
        The best rank-r approximation of X
    """
    n_rows, n_cols, n_channels = X.shape
    # Reshape X to a 2D array
    X_reshape = X.reshape(n_rows, n_cols * n_channels)
    # Compute SVD
    U, S, V = np.linalg.svd(X_reshape, full_matrices=False)
    # Keep only the top r first singular values
    S[r:] = 0
    # Compute the approximation
    X_reshape_r = U.dot(np.diag(S)).dot(V)
    # Put it between 0 and 255 again and cast to integer type
    return X_reshape_r.clip(min=0, max=255).astype('int')\
        .reshape(n_rows, n_cols, n_channels)
In [99]:
ranks = [100, 70, 50, 30, 10, 5]
n_ranks = len(ranks)
for i, r in enumerate(ranks):
    X_r = compute_approx(X, r)
    # plt.subplot(n_ranks, 1, i + 1)
    plt.figure(figsize=(5, 5))
    plt.imshow(X_r)
    _ = plt.axis('off')
    plt.title('Rank %d approximation of the racoon' % r, fontsize=16)
    plt.tight_layout()