import os
# For fun: try without this. Develop a theory that explains your observation.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import numpy as np

from time import process_time

# Pick the operand shapes. Flip the `0` to `1` to benchmark the small-A /
# very-wide-B case instead of the default square 2048 x 2048 case.
a_shape, b_shape = (
    ((25, 25), (25, 800000)) if 0 else ((2048, 2048), (2048, 2048))
)

# Standard-normal operands; A is drawn before B.
A = np.random.randn(*a_shape)
B = np.random.randn(*b_shape)

# Time one dense matrix-matrix product. process_time() reports CPU time of
# this process, so `elapsed` is CPU seconds rather than wall-clock seconds.
start = process_time()
np.matmul(A, B)  # result is discarded on purpose; only the cost matters
elapsed = process_time() - start
print(elapsed)

0.35817869300000005

# Arithmetic throughput: an (m x k) @ (k x n) product performs 2*m*k*n
# floating-point operations (one multiply and one add per inner-product
# term). A.size is m*k and B.shape[1] is n.
print(f"{A.size * B.shape[1] * 2/1e9/elapsed} GFlops/s")

# Minimum memory traffic: read A and B once and write the product once.
# The product is not kept around, so its size is derived from the operand
# shapes and the result dtype. Summing the three actual sizes (rather than
# assuming 3 * A.nbytes) keeps the figure correct when A and B differ in
# shape, as in the 25 x 25 times 25 x 800000 configuration.
product_nbytes = A.shape[0] * B.shape[1] * np.result_type(A, B).itemsize
print(f"{(A.nbytes + B.nbytes + product_nbytes)/elapsed/1e9} GB/s")

47.96452027926742 GFlops/s
0.28104211101133253 GB/s

# Baseline for comparison: compute the same product one column at a time,
# i.e. one matrix-vector product per column of B.
start = process_time()
for column in B.T:  # rows of B.T are the columns of B (views, not copies)
    A @ column
elapsed = process_time() - start
print(elapsed)

Benchmarking Matrix-Matrix Multiply (via BLAS)

Point of comparison: Repeated matrix-vector multiplication