[Data Analysis] Topic 1: NumPy

Authored by Tony Feng

Created on April 30th, 2022

Last Modified on April 30th, 2022

Intro

NumPy is the core library for scientific computing in Python. It provides a high-performance multi-dimensional array object, along with a large collection of high-level mathematical tools for working with these arrays.


Basics of Arrays

Array Initialization

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import numpy as np
# Create a rank 1 array
a = np.array([1, 2, 3])   

# Create a rank 2 array
b = np.array([[1,2,3],[4,5,6]])    

# Create an array of all zeros (float64 by default)
c = np.zeros((2,2))   # array([[0., 0.], [0., 0.]])
d = np.zeros_like(b)  # array([[0, 0, 0], [0, 0, 0]]); same shape and dtype as b

# Create an array of all ones
e = np.ones((1,2))    # array([[1., 1.]])

# Create a constant array; the dtype follows the fill value, so this one holds integers
f = np.full((2,2), 7) # array([[7, 7], [7, 7]])

# Create a 2x2 identity matrix
g = np.eye(2)         # array([[1., 0.], [0., 1.]])
h = np.identity(2)    # array([[1., 0.], [0., 1.]])

# Create an array filled with random values drawn from [0, 1)
i = np.random.random((2,2))  # e.g. array([[0.91940167, 0.08143941], [0.68744134, 0.87236687]])

# Create an uninitialized array, then fill it with 1
j = np.empty((3,2))
j.fill(1)             # j is now array([[1., 1.], [1., 1.], [1., 1.]])

Array Properties

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Get the shape of the array
a = np.array([1, 2, 3])   
b = np.array([[1,2,3],[4,5,6]])    
print(a.shape)      # (3,)
print(b.shape)      # (2, 3)

# Get the number of elements of the array
print(a.size)       # 3
print(b.size)       # 6

# Get the number of dimensions of the array
print(a.ndim)       # 1
print(b.ndim)       # 2

# Get the type of the array
print(type(a))      # <class 'numpy.ndarray'>

# Get the type of the elements
print(a.dtype)      # int64 (the default integer type is platform-dependent)

# All elements of an ndarray share a single dtype.
# Mixed inputs are converted to a common type: int -> float -> string
a_list = [1, 2, '3']
a_arr = np.array(a_list)    # array(['1', '2', '3']), every element becomes a string
b_list = [1, 2, 3.0]
b_arr = np.array(b_list)    # array([1., 2., 3.]), every element becomes a float

Array Indexing and Slicing

Integer Array Indexing

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
a = np.array([[1, 2], [3, 4], [5, 6]])

# Integer array indexing pairs each row index with the column index at the
# same position, so the two expressions below are equivalent. The shape is (3,).
print(a[[0, 1, 2], [0, 1, 0]])  # [1 4 5]
print(np.array([a[0, 0], a[1, 1], a[2, 0]]))  # [1 4 5]

# Create an array of indices
# Select one element from each row of a using the indices in b
b = np.array([0, 1, 1])
print(a[np.arange(3), b])  # [1 4 6]

# Mutate one element from each row of a (in place) using the indices in b
a[np.arange(3), b] += 10   # a is now array([[11, 2], [3, 14], [5, 16]])

Boolean Array Indexing

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
a = np.array([[1,2], [3, 4], [5, 6]])

# We use boolean array indexing to construct a rank 1 array consisting 
# of the elements of a corresponding to the True values of bool_idx.
bool_idx = (a > 2)      # array([[False, False], [ True,  True], [ True,  True]])
a[bool_idx]             # array([3, 4, 5, 6])

# The two expressions below are equivalent to the one above.
a[a > 2]                # array([3, 4, 5, 6])
a[np.where(a > 2)]      # array([3, 4, 5, 6]); np.where returns the indices of the True entries

x = np.array([1, 0, 3, 4])
y = np.array([1, 2, 3, 5])
x == y                  # array([ True, False,  True, False])
# The logical functions treat 0 as False and any nonzero value as True.
np.logical_and(x, y)    # array([ True, False,  True,  True])
np.logical_or(x, y)     # array([ True,  True,  True,  True])

Slicing

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
b = a[:2, 1:3]             # array([[2, 3], [6, 7]]), a view that shares memory with a
c = a[:2, 1:3].copy()      # array([[2, 3], [6, 7]]), an independent copy

# A slice of an array is a view into the same data, 
# so modifying it will modify the original array.
print(a[0, 1])      # 2
b[0, 0] = 77        # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1])      # 77
c[0, 0] = 88        # c is a new array in a different memory space, so a is unaffected.
print(a[0, 1])      # 77

# Two ways of accessing the data in the middle row of the array.
# Mixing integer indexing with slices yields an array of lower rank,
# while using only slices yields an array of the same rank as the original array:
row_r1 = a[1, :]    # Rank 1 view of the second row of a
row_r2 = a[1:2, :]  # Rank 2 view of the second row of a
print(row_r1, row_r1.shape)  # [5 6 7 8] (4,)
print(row_r2, row_r2.shape)  # [[5 6 7 8]] (1, 4)

# We can make the same distinction when accessing columns of an array.
# (a[0, 1] was set to 77 above, so the second column now starts with 77.)
col_r1 = a[:, 1]    # Rank 1 view of the second column of a
col_r2 = a[:, 1:2]  # Rank 2 view of the second column of a
print(col_r1, col_r1.shape)  # [77  6 10] (3,)
print(col_r2, col_r2.shape)  # [[77] [ 6] [10]] (3, 1)

Data Calculation

Basic Operations

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
a = np.array([[1, 2, 3], [4, 5, 6]])

# Summation (axis=0 reduces over rows, i.e. per column; axis=1 reduces per row)
s1 = np.sum(a)            # 21
s2 = np.sum(a, axis=0)    # array([5, 7, 9]), shape is (3,)
s3 = np.sum(a, axis=1)    # array([ 6, 15]), shape is (2,)

# Product
s1 = np.prod(a)           # 720
s2 = np.prod(a, axis=0)   # array([ 4, 10, 18]), shape is (3,)
s3 = np.prod(a, axis=1)   # array([  6, 120]), shape is (2,)

# Minimum
s1 = a.min()              # 1
s2 = a.min(axis=0)        # array([1, 2, 3]), shape is (3,)
s3 = a.min(axis=1)        # array([1, 4]), shape is (2,)

# The index of the minimum
s1 = a.argmin()           # 0, an index into the flattened array
s2 = a.argmin(axis=0)     # array([0, 0, 0]), shape is (3,)
s3 = a.argmin(axis=1)     # array([0, 0]), shape is (2,)

# Mean
s1 = a.mean()             # 3.5
s2 = a.mean(axis=0)       # array([2.5, 3.5, 4.5]), shape is (3,)
s3 = a.mean(axis=1)       # array([2., 5.]), shape is (2,)

# Standard Deviation (population std, i.e. ddof=0, by default)
s1 = a.std()              # ~1.7078
s2 = a.std(axis=0)        # array([1.5, 1.5, 1.5]), shape is (3,)
s3 = a.std(axis=1)        # array([0.8165, 0.8165]) approximately, shape is (2,)

# Variance (population variance, i.e. ddof=0, by default)
s1 = a.var()              # ~2.9167
s2 = a.var(axis=0)        # array([2.25, 2.25, 2.25]), shape is (3,)
s3 = a.var(axis=1)        # array([0.6667, 0.6667]) approximately, shape is (2,)

# Clip (returns a new array; a itself is unchanged)
a.clip(2, 4)               # array([[2, 2, 3], [4, 4, 4]]), 
                           # i.e., num -> 2 if num < 2; num -> 4 if num > 4

# Round (values exactly halfway are rounded to the nearest even value, so 2.5 -> 2.)
a = np.array([1.1, 2.5, 3.8])
a.round(decimals=0)        # array([1., 2., 4.])

Matrix Operation

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce array([[ 6.,  8.], [10., 12.]])
x + y
np.add(x, y)

# Elementwise difference; both produce array([[-4., -4.], [-4., -4.]])
x - y
np.subtract(x, y)

# Elementwise product; both produce array([[ 5., 12.], [21., 32.]])
x * y
np.multiply(x, y)

# Elementwise division; both produce approximately
# array([[0.2, 0.3333], [0.4286, 0.5]])
x / y
np.divide(x, y)

# Elementwise square root; produces approximately
# array([[1., 1.4142], [1.7321, 2.]])
np.sqrt(x)

v = np.array([9,10])
w = np.array([11, 12])

# Inner product of vectors; both produce 219
v.dot(w)
np.dot(v, w)

# Matrix - Vector product; both produce the rank 1 array [29. 67.]
x.dot(v)
np.dot(x, v)

# Matrix - Matrix product; both produce the rank 2 array
# array([[19., 22.], [43., 50.]])
x.dot(y)
np.dot(x, y)

# Transposing
x = np.array([[1,2], [3,4]])
x               # array([[1, 2], [3, 4]])
x.T             # array([[1, 3], [2, 4]])
x.transpose()   # array([[1, 3], [2, 4]])


v = np.array([1,2,3]) # Note that taking the transpose of a rank 1 array does nothing
v       # array([1, 2, 3])
v.T     # array([1, 2, 3])

Useful Modules

Sorting

1
2
3
4
5
6
7
a = np.array([[1, 5, 3], [4, 2, 6]])
# Sort (np.sort returns a sorted copy; the default is axis=-1, the last axis)
np.sort(a, axis=0)      # array([[1, 2, 3], [4, 5, 6]]), sorts each column
np.sort(a, axis=1)      # array([[1, 3, 5], [2, 4, 6]]), sorts each row

# Sort and get index (the indices that would sort each row, since axis=-1 by default)
np.argsort(a)           # array([[0, 2, 1], [1, 0, 2]])

Reshaping

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
a = np.arange(6)        # array([0, 1, 2, 3, 4, 5])

# Change the shape in place
# (a.reshape(2, 3) returns the reshaped array and leaves a's own shape unchanged)
a.shape = 2,3           # a is now array([[0, 1, 2], [3, 4, 5]])

# Dimensionality Manipulation
a = np.arange(10)
a = a[np.newaxis,:]     # add a leading axis: shape is (1, 10)
a = a.squeeze()         # drop all axes of length 1: shape is (10,)
a = a[:, np.newaxis]    # add a trailing axis: shape is (10, 1)

Concatenation

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[7, 8, 9], [10, 11, 12]])
c = np.concatenate((a, b), axis=0)      # array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
                                        # same result as np.vstack((a, b))
d = np.concatenate((a, b), axis=1)      # array([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]])
                                        # same result as np.hstack((a, b))

# Flatten (returns a rank 1 copy in row-major order)
c.flatten()     # array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]), i.e. shape is (12,)
d.flatten()     # array([ 1,  2,  3,  7,  8,  9,  4,  5,  6, 10, 11, 12]), i.e. shape is (12,)

Randomization

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Create a random 3x4 matrix with values drawn uniformly from [0, 1)
a = np.random.rand(3,4)

# Create a 3x2 matrix of random integers in the range [0, 10)
b = np.random.randint(10, size=(3,2))

# Get a random number
c = np.random.rand()

# Create a 10-element array drawn from a Gaussian distribution with mean 1 and
# standard deviation 0.1 (the second argument is the std, not the variance)
np.set_printoptions(precision=2)        # Global setup: print floats with 2 decimal places
d = np.random.normal(1, 0.1, 10)

# Shuffle in place (for a 2-D array only the order of the rows changes)
np.random.shuffle(a)

# Specify a seed so that the random results stay the same.
for i in range(0, 5):
    init = np.array([1, 2, 3, 4, 5, 6])
    np.random.seed(100)  # Reseeding makes every outer iteration print the same three shuffles

    for j in range(0, 3):
        np.random.shuffle(init) 
        print(init)              

File Operation

1
2
3
4
5
6
7
# If the data produced during execution is large, it can be saved to a .npy file
# so that it does not have to be recomputed on every run.
# An ndarray is also much more efficient than a list for numerical computation.
a = np.array([[1, 2, 3], [4, 5, 6]])
np.save('data.npy', a)
np.load('data.npy')     # array([[1, 2, 3], [4, 5, 6]])

Reference


MIT License
Last updated on May 02, 2022 05:20 EDT
Built with Hugo
Theme Stack designed by Jimmy