Part 2: Simulations¶
To start this notebook, we recall the setting we are interested in. We want to understand the impact of noise (for instance, observation error, or any other form of statistical uncertainty) on the performance of a particular kind of RNN: echo state networks, and more generally reservoir computers. Because there is noise, we need to sample the reservoir many times to get a good estimate of the output weights. Sampling could in principle be trivially parallelized as well, but we will instead parallelize over different copies of the reservoir, for two reasons: we expect the number of reservoirs needed to be larger than the number of samples needed, and repeated sampling is just the special case in which the different reservoirs happen to be identical.
With these points in mind, we turn to actually generating some results! We implement an efficient GPU simulator for multiple stacked reservoirs. After averaging over an ensemble, we are left with only a few free parameters. First, the problem is heavily overparameterized, so ridge regression is necessary; we sweep the ridge parameter. That leaves the reservoir size and the amount of noise, and for each point on this two-dimensional grid we report an NRMSE value.
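For reference, the error metric we report throughout is the normalized root-mean-square error computed in the code below,
$$ \mathrm{NRMSE}(\hat{y}, y) = \sqrt{\frac{\tfrac{1}{T}\sum_{t=1}^{T}\left(\hat{y}_t - y_t\right)^2}{\mathrm{Var}(y)}}, $$
so a predictor that always outputs the mean of the target scores close to 1.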
import torch
import numpy as np
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
from itertools import combinations
from functools import reduce
from time import time
import pickle
def fetchData(n):
    # Generate n samples of the NARMA10 task: a random input sequence u and
    # the corresponding nonlinear autoregressive target y.
    history = 10
    while True:
        u = 0.5 * np.random.uniform(size=(n + history))
        y = np.zeros(shape=(n + history))
        for i in range(history, n + history):
            y[i] = 0.3 * y[i-1] + 0.05 * y[i-1] * np.sum(y[i-history:i]) \
                   + 1.5 * u[i-1] * u[i-history] + 0.1
        # The recurrence occasionally diverges; suppress the overflow warnings
        # and resample until we get a finite trajectory.
        import warnings
        warnings.filterwarnings("ignore")
        if np.isfinite(y).all():
            return (u[history:], y[history:])
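As a quick sanity check (a minimal usage sketch, not part of the original pipeline), we can draw a short NARMA10 sequence and confirm the shapes and that the resampling loop really returns a finite trajectory:
u, y = fetchData(1000)
print(u.shape, y.shape)      # (1000,), (1000,)
print(np.isfinite(y).all())  # True: divergent trajectories are resampled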
def ESN_init(inSize, outSize, resSize, alpha, sparsity):
    # Random input weights and a sparse random recurrent matrix,
    # rescaled to unit spectral radius.
    Win = np.random.rand(inSize + 1, resSize) - 0.5
    W = np.random.rand(resSize, resSize) - 0.5
    W[np.random.rand(resSize, resSize) > sparsity] = 0
    spec_rad = max(abs(np.linalg.eig(W)[0]))
    W /= spec_rad
    return Win, W
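Another small check (again just an illustrative sketch): the recurrent matrix really does come out with unit spectral radius.
np.random.seed(0)
Win, W = ESN_init(1, 1, 50, 0.7, 0.8)
print(Win.shape, W.shape)              # (2, 50), (50, 50)
print(max(abs(np.linalg.eigvals(W))))  # ~1.0 by construction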
def res(data, Ws, Wins, resSize, device):
    # Simulate len(Ws) independent reservoirs on the same input sequence and
    # return the design matrices [1, u(t), r(t)] with the warmup removed.
    inSize, outSize, alpha, sparsity = 1, 1, 0.7, 0.8
    data_tensor = torch.tensor(data, dtype=torch.float32).unsqueeze(0)
    data_copies = data_tensor.repeat(len(Ws), 1)
    W_copies = torch.stack([torch.tensor(W, dtype=torch.float32, device=device) for W in Ws]).to(device)
    Win_copies = torch.stack([torch.tensor(Win, dtype=torch.float32, device=device) for Win in Wins]).to(device)
    R_copies = 0.1 * (torch.rand((len(Ws), resSize), device=device) - 0.5).to(device)
    # Stream the input through the GPU in chunks of chunk_size time steps so
    # that the full design matrix never has to live on the device all at once.
    chunk_size = 1
    i = 0
    dm = torch.zeros((len(Ws), chunk_size, 1 + inSize + resSize))
    chunk = data_copies[:, chunk_size*i:chunk_size*(i+1)]
    rtn = []
    while chunk.shape[1] != 0:
        dm = dm.to(device)
        chunk = chunk.to(device)
        for t in range(chunk.size(1)):
            u = chunk[:, t].unsqueeze(-1)  # current input for every reservoir copy
            ones = torch.ones(u.shape[0], 1, device=device)
            ones = torch.hstack((ones, u))  # [1, u(t)] per copy
            # Leaky-tanh reservoir update, applied to all copies at once.
            R_copies = (1 - alpha)*R_copies +\
                alpha*torch.tanh(torch.einsum('ij,ijk->ik', ones, Win_copies) +\
                torch.einsum('ij,ijk->ik', R_copies, W_copies))
            # Indices: copy, time, signal.
            dm[:, t, :] = torch.cat((ones, R_copies), dim=1)
        rtn.append(dm.cpu())
        i += 1
        dm = torch.zeros((len(Ws), chunk_size, 1 + inSize + resSize))
        chunk = data_copies[:, chunk_size*i:chunk_size*(i+1)]
    rtn = torch.cat(rtn, dim=1)[:, 50:data_copies.size(1), :]  # cut out the warmup
    return rtn
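A brief shape check on the simulator (illustrative only; `device` could equally be `"cuda"` or `"mps"` here):
device = torch.device("cpu")
u, y = fetchData(500)
Wins, Ws = zip(*[ESN_init(1, 1, 10, 0.7, 0.8) for _ in range(4)])
dm = res(u, list(Ws), list(Wins), 10, device)
print(dm.shape)  # (4, 450, 12): (reservoir copies, time steps after the 50-step warmup, 1 + inSize + resSize)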
The first new detail is that our simulations have noise, which modifies the least squares solution. We recall from the last notebook that the functional form of the least squares solution without noise is
$$ \begin{align} W_{out} = (X^TX)^{-1}X^TY = X^+Y \end{align} $$ where $+$ denotes the pseudo-inverse. Generalizing to the noisy setting, the form becomes
$$ \begin{align} W_{out} = \langle \overline{XX^T}\rangle^+ \langle \overline{XY^T} \rangle, \end{align} $$ where $\langle \cdot \rangle$ is an average over the noise and $\overline{\cdot}$ is an average over the input randomness (we have been using the uniform measure). The derivation can be found in [1]; for this note it suffices to say that the solution now depends on the covariance of the reservoir data $X$ with itself and on the covariance between the targets $Y$ and the reservoir data. We compute these averages first, wrapped in a function so that we can parallelize over GPUs. We additionally chunk all computations to keep the GPU from running out of memory.
[1] Polloreno, Anthony M., Reuben RW Wang, and Nikolas A. Tezak. "A Note on Noisy Reservoir Computation." arXiv preprint arXiv:2302.10862 (2023).
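Before the full GPU implementation below, here is a minimal NumPy sketch of that estimator (the toy data, shot count, noise level, and ridge value are invented for the sketch): we estimate the noise-averaged second moments by averaging over repeated noisy copies of the same design matrix, then solve the ridge-regularized system. The design matrix is stored time-by-features, so the note's $\langle \overline{XX^T}\rangle$ corresponds to `X.T @ X` here.
rng = np.random.default_rng(0)
T, F, shots, sigma, lam = 200, 8, 10, 0.05, 1e-6
X_clean = rng.uniform(-0.5, 0.5, size=(T, F))  # stand-in for reservoir features
w_true = rng.normal(size=F)
Y = X_clean @ w_true                           # stand-in target
XtX, XtY = np.zeros((F, F)), np.zeros(F)
for _ in range(shots):
    X = X_clean + sigma * rng.normal(size=X_clean.shape)  # fresh noise per shot
    XtX += X.T @ X / shots
    XtY += X.T @ Y / shots
W_out = np.linalg.solve(XtX + lam * np.eye(F), XtY)
print(np.round(W_out - w_true, 3))             # small shrinkage toward zero from the noise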
def gpu_job(device, res_size, noise, ridge):
    # Run one grid point (reservoir size, noise level, ridge value) on a single
    # device. Note that train_cycles and test_cycles are module-level globals,
    # set by whatever driver launches these jobs.

    # Utilities.
    def compute_wout():
        # Noisy least squares: sum X X^T over time and average over shots,
        # average the design matrix against the targets, then solve the
        # ridge-regularized system. Everything is chunked over the
        # reservoir-copy dimension to avoid exhausting GPU memory.
        chunk_size = 1
        j, b, t, i_k = dms.size()  # (shots, reservoir copies, time, features)
        num_chunks = (b + chunk_size - 1) // chunk_size
        a_chunks = []
        for chunk_idx in range(num_chunks):
            start_idx = chunk_idx * chunk_size
            end_idx = min(start_idx + chunk_size, b)
            dms_chunk = dms[:, start_idx:end_idx, :, :].to(device)
            a_chunk = torch.einsum('jbti,jbtk -> bitk', dms_chunk, dms_chunk)
            a_chunk = torch.sum(a_chunk, axis=2) / numshots
            a_chunks.append(a_chunk)
        a = torch.cat(a_chunks, dim=0)
        avg = sum(dms) / len(dms)  # shot-averaged design matrices
        i_total, j, k = avg.size()
        num_chunks = (i_total + chunk_size - 1) // chunk_size
        result = []
        for chunk_idx in range(num_chunks):
            start_idx = chunk_idx * chunk_size
            end_idx = min((chunk_idx + 1) * chunk_size, i_total)
            avg_chunk = avg[start_idx:end_idx, :, :].to(device)
            result_chunk = torch.einsum('ikj,k->ij', avg_chunk, Y_train[50:])
            result.append(result_chunk)
        b = torch.cat(result, dim=0)
        ridge_mat = torch.eye(a.shape[1]).to(device)
        Wout = torch.linalg.lstsq(a + ridge*ridge_mat, b, rcond=None)[0]
        return Wout

    def compute_powerset(data):
        # Expand the signals into products over every nonempty subset of
        # features; both the raw product and a max-normalized version are kept.
        to_reduce = [data[:, :, idx] for idx in generate_powerset(range(data.shape[2]))[1:]]
        ones = torch.ones(to_reduce[0][:, :, 0].shape)
        rtn = []
        for r in to_reduce:
            rtn.append(reduce(lambda a, b: a*b, [r[:, :, i] for i in range(r.shape[2])], ones))
            rtn.append(reduce(lambda a, b: ((a*b).T/(torch.max(torch.abs(a*b), dim=1)[0])).T,
                              [r[:, :, i] for i in range(r.shape[2])], ones))
        return rtn

    def set_seed(seed):
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def generate_powerset(s):
        # All subsets of s, including the empty set.
        power_set = []
        for subset_size in range(len(s)+1):
            for subset in combinations(s, subset_size):
                power_set.append(list(subset))
        return power_set

    set_seed(0)
    inSize, outSize, resSize, alpha, sparsity = 1, 1, res_size, 0.7, 0.8
    data, Y = fetchData(train_cycles + test_cycles + 100)
    data_train, Y_train = data[:train_cycles+50], Y[:train_cycles+50]
    data_test, Y_test = data[train_cycles+50:], Y[train_cycles+50:]

    # Initialize reservoir networks.
    Wins, Ws = [], []
    num_reservoirs = 500  # hard coded, for now.
    for _ in range(num_reservoirs):
        Win, W = ESN_init(inSize, outSize, resSize, alpha, sparsity)
        Wins.append(Win)
        Ws.append(W)

    # Sample the noisy reservoir numshots times on the training data.
    numshots = 10
    dms = []
    for _ in range(numshots):
        dm = res(data_train, Ws, Wins, resSize, device)
        # Simple noise model.
        dm += torch.normal(mean=0, size=dm.shape, std=noise)
        rtn = compute_powerset(dm)
        dm = torch.stack(rtn, dim=2)
        dms.append(dm)
    dms = torch.stack(dms)

    # Noise-free test run through the same reservoirs.
    test = res(data_test, Ws, Wins, resSize, device)
    rtn = compute_powerset(test)
    test = torch.stack(rtn, dim=2)

    Y_train = torch.tensor(Y_train).float()
    Y_train = Y_train.to(device)
    Wout = compute_wout()
    # With the power set, it's too big for the GPU.
    Wout = Wout.to(torch.device('cpu'))
    Yhat = torch.einsum('ijk,ik->ij', test, Wout).to(device)
    expand_test = torch.tensor(Y_test[50:])[:, None].expand(-1, dm.shape[0]).T.to(device)
    NRMSEs_lstsq = torch.sqrt(torch.mean((expand_test-Yhat)**2, axis=1)/torch.var(expand_test, axis=1))
    rtn = []
    # Remove outliers before averaging over the ensemble.
    for n in NRMSEs_lstsq:
        if n <= 2:
            rtn.append(n.cpu())
    rtn = np.mean(rtn)
    return res_size, rtn
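The sweep itself is dispatched outside this notebook; purely as an illustration (the grid values, device list, and output filename below are invented, and `gpu_job` additionally expects the globals `train_cycles` and `test_cycles` to be set), a driver built from the `ThreadPoolExecutor`, `as_completed`, and `pickle` imports above might look like this:
# Hypothetical sweep driver: one worker per device, one gpu_job call per grid point.
train_cycles, test_cycles = 1000, 200          # assumed values for the sketch
devices = [torch.device(f"cuda:{i}") for i in range(torch.cuda.device_count())] or [torch.device("cpu")]
res_sizes = [2, 3, 4, 5]                       # reservoir sizes to sweep (illustrative)
noises = [0.0, 0.01, 0.1]                      # noise levels (illustrative)
ridges = [1e-8, 1e-4, 1e-1]                    # ridge parameters (illustrative)
jobs = [(r, n, l) for r in res_sizes for n in noises for l in ridges]
results = {}
with ThreadPoolExecutor(max_workers=len(devices)) as pool:
    futures = {pool.submit(gpu_job, devices[i % len(devices)], r, n, l): (r, n, l)
               for i, (r, n, l) in enumerate(jobs)}
    for fut in as_completed(futures):
        results[futures[fut]] = fut.result()[1]  # ensemble-averaged NRMSE
with open("sweep_results.pkl", "wb") as f:       # hypothetical filename
    pickle.dump(results, f)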
For this simulation, we have added Gaussian noise to each output of the reservoir. This is a simple noise model, and as we will see in the next notebook, it keeps the reasoning about how noise affects reservoir performance simple enough for a blog post. We execute the code and analyze the results in the next notebook; but first, a note about choosing the number of reservoirs in each simulation.
To reason about the number of reservoirs we should include in each simulation, note that Chebyshev's inequality [2] tells us that for a random variable $X$ with mean $\mu$ and variance $\sigma_0^2$,
$\Pr(|X-\mu |\geq k\sigma_0 )\leq {\frac {1}{k^{2}}}$.
For an ensemble average over $N$ reservoirs, if the ensemble has variance $\sigma_0^2$, then the sample mean $X_N = \frac{1}{N} \sum_{i=1}^N X_i$ has mean $\mu$ and variance $\sigma^2 = \sigma_0^2/N$, so that
$\Pr(|X_N-\mu|\geq k\sigma_0 ) \leq {\frac {1}{Nk^{2}}}$.
We don't know a priori what the distribution of $X$ is, but we can reason about it a bit. If it were Gaussian, the standard result gives $\Pr(|X-\mu|\leq \sigma_0) \approx 68\%$, so the distribution-free Chebyshev bound above is conservative.
We also know empirically that $\sigma_0 \lesssim 1$ (since the NRMSE itself is less than 1), so taking $k \sim 0.5$ and $N \sim 500$ gives a bound of $\sim 1\%$. This is a very coarse bound on the deviation from the mean, but it holds with very high probability, and we will see that in practice it works much better than these numbers suggest.
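Concretely, plugging these numbers into the bound above,
$$ \Pr\big(|X_N-\mu| \geq 0.5\,\sigma_0\big) \leq \frac{1}{N k^2} = \frac{1}{500 \cdot 0.25} = 0.8\%. $$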
Finally, we include an appendix on speeding up the simulation for larger networks, where more training data is necessary and reservoir simulation times are therefore longer.
Appendix: Alternate Timewise Parallelization Scheme¶
Alternatively, we can take advantage of GPU parallelism by parallelizing the reservoir simulation in the time dimension. This isn't obvious, so let's check it out: we can validate on the test set (or train on the training set!) roughly $n$ times as fast if we break the data into $n$ pieces. From the last notebook, we can plot our performance on NARMA10:
from reservoir import plot_NARMA
Wout, Echo, Y_test, data_test = plot_NARMA()
0.5887707279682656
Speeding up simulation¶
Now, we'll plot the testing data for the same trained reservoir, but split into two batches.
import matplotlib.pyplot as plt
from reservoir import reservoir_slow, fetchData
import numpy as np
np.random.seed(137)
plt.figure(figsize=(6, 12))
index = len(data_test) // 2
RA_Test1 = reservoir_slow(data_test[:index], *Echo, True)
RA_Test2 = reservoir_slow(data_test[index:], *Echo, True)
# Plot for first half of data
plt.subplot(2, 1, 1)
plt.plot(Y_test[:index], color='red', linewidth=5, label='Target Value')
plt.plot(np.dot(RA_Test1, Wout), color='green', linestyle=":", linewidth=2,
label='Test Prediction - First Half')
plt.legend()
plt.ylabel("NARMA(t)")
plt.xlabel("Test Sample (t)")
plt.title("Test Performance")
plt.yscale('log')
plt.ylim(.2,1)
# Plot for second half of data
plt.subplot(2, 1, 2)
plt.plot(Y_test[index:], color='red', linewidth=5, label='Target Value')
plt.plot(np.dot(RA_Test2, Wout), color='green', linestyle=":", linewidth=2,
label='Test Prediction - Second Half')
plt.legend()
plt.ylabel("NARMA(t)")
plt.xlabel("Test Sample (t)")
plt.title("Test Performance")
plt.yscale('log')
plt.ylim(.2,1)
plt.show()
The green spikes at the beginning need to be thrown away, but now these are embarrassingly parallel matrix multiplications. Let's see if we can get a speedup by taking advantage of this.
import torch
from functools import reduce
from itertools import combinations
def reservoir(data, Win, Wres, inSize, resSize, alpha, batch_number, power_set):
    # Apple "metal performance shaders".
    device = torch.device("mps")
    # Convert Wres, Win, and data to tensors on the device.
    Wres = torch.tensor(Wres, device=device, dtype=torch.float32)
    Win = torch.tensor(Win, device=device, dtype=torch.float32)
    data = torch.tensor(data, device=device, dtype=torch.float32)
    batch_size = len(data) // batch_number
    # We'll trim some of the data for simplicity; we can fix this later.
    data = data[:batch_number*batch_size]
    new_data = data.view(batch_number, batch_size)
    # Replicate Wres, Win batch_number times so every time-chunk runs in parallel.
    Wres_copies = Wres.repeat(batch_number, 1, 1)
    Win_copies = Win.repeat(batch_number, 1, 1)
    R_copies = 0.1 * (torch.ones((batch_number, resSize), device=device) - 0.5)
    dm = torch.zeros((batch_number, batch_size - 50, 1 + inSize + resSize), device=device)
    for t in range(batch_size):
        u = new_data[:, t, None]
        ones = torch.hstack((torch.ones(u.shape[0], 1, device=device), u))
        R_copies = (1 - alpha)*R_copies +\
            alpha*torch.tanh(torch.einsum('ij,ijk->ik', ones, Win_copies) +\
            torch.einsum('ij,ijk->ik', R_copies, Wres_copies))
        # The first 50 steps of every chunk are warmup and are discarded.
        if t >= 50:
            dm[:, t-50, :] = torch.cat((ones, R_copies), dim=1)
    dm = dm.view(-1, 1+inSize+resSize)
    new_data = new_data[:, 50:].flatten()
    s = list(dm.T)[2:]  # the reservoir state signals (skip the bias and input columns)
    chosen_subsets = []
    power_signals = []
    if power_set:
        # Generate all subsets and take the product of the signals in each one.
        ones = torch.ones(new_data.shape[0], dtype=torch.float32)
        for subset_size in range(len(s)+1):
            for subset in combinations(s, subset_size):
                chosen_subsets.append([l.cpu().numpy() for l in subset])
        power_signals = [reduce(lambda a, b: a*b, el, ones.numpy()) for el in chosen_subsets]
        power_signals = [torch.tensor(pw, dtype=torch.float32) for pw in power_signals]
    power_signals = [el.cpu() for el in list(dm.T)] + power_signals
    return torch.vstack(power_signals).T, new_data
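A quick usage sketch (assuming Apple silicon, since the function hard-codes the `mps` device; the sizes here are illustrative and use the `ESN_init` defined earlier in this notebook, which returns `(Win, W)`): with `power_set=True` the returned design matrix has $1 + \text{inSize} + \text{resSize} + 2^{\text{resSize}}$ columns, the raw signals plus one product per subset of reservoir states.
u, _ = fetchData(10_050)
Win, Wres = ESN_init(1, 1, 5, 0.7, 0.9)
signals, targets = reservoir(u, Win, Wres, 1, 5, 0.7, batch_number=100, power_set=True)
print(signals.shape, targets.shape)  # (5000, 39) and (5000,): 100 batches x 50 kept steps, 7 raw columns + 2**5 products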
Seeing the speed up¶
Now we are going to see if we get a speedup with this new code. In the next notebook we are going to simulate many different reservoirs to see their empirical performance as a function of size, where we also vary the amount of noise we inject into the reservoir. For this notebook, we will simply vary the amount of training data we consider in order to demonstrate a speedup. For small networks this much training data is unnecessary (and can even hurt, through overfitting); we are just benchmarking the code!
from reservoir import ESN_init, fetchData, reservoir_slow
import numpy as np
powerset = True
inSize = outSize = 1
resSize = 5
alpha = .7
sparsity = .9
train_cycles = 1000000
Echo = ESN_init(inSize, outSize, resSize, alpha, sparsity)
data_train, Y_train = fetchData(train_cycles)
In the next cell, we vary the number of batches. Parallelizing the computation over more batches is good, but the more batches we use, the more data we have to throw away.
from time import time
number_of_batches = [20000, 10000, 2000, 1000]
runtimes = []
for batch_number in number_of_batches:
start = time()
reservoir(data_train, *Echo, batch_number, powerset)
stop = time()
runtimes.append(stop-start)
print(runtimes)
[0.7789459228515625, 0.9311521053314209, 1.4612369537353516, 2.1751229763031006]
Because we need to throw away the initial samples while the reservoir "warms up", we lose data on each run (50 samples per batch). At the extreme of 20,000 batches with $10^6$ training samples, the batch size is exactly 50, so every sample is warmup and nothing survives. We'll benchmark the slow reservoir on each of these reduced training data lengths.
from time import time
number_of_batches = [20000, 10000, 2000, 1000]
runtimesserial = []
for batch_number in number_of_batches:
start = time()
reservoir_slow(data_train[:-50*batch_number], *Echo, powerset)
stop = time()
runtimesserial.append(stop-start)
print(runtimesserial)
[0.00034689903259277344, 9.030734062194824, 16.001220226287842, 16.72932481765747]
speedups = np.array(runtimesserial)/np.array(runtimes)
plt.plot(number_of_batches, speedups)
plt.title("Multiplicative Speedup of Parallel Implementation")
plt.xlabel("Number of batches")
plt.ylabel("Serial time / GPU time");
print(f"The maximum speed up is {np.max(speedups)}!")
The maximum speed up is 10.950462336299404!
Even with this simple optimization, just adding a batch index, we see that we decrease training time. As long as $50\times$ the number of batches is significantly smaller than the amount of training data, the cost of throwing data away is negligible, and we have turned a serial task into a parallel one. We expect a speedup as long as the parallelized computation still fits on the GPU. A better benchmark is to fix the goal, namely the amount of training data that survives after the warmup is discarded.
from time import time
from reservoir import ESN_init, fetchData, reservoir_slow
import numpy as np
train_cycles = 100000
number_of_batches = [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
runtimes = []
for batch_number in number_of_batches:
Echo = ESN_init(inSize, outSize, resSize, alpha, sparsity)
data_train, Y_train = fetchData(batch_number * 50 + train_cycles)
start = time()
reservoir(data_train, *Echo, batch_number, powerset)
stop = time()
runtimes.append(stop-start)
print(runtimes)
[0.38776707649230957, 0.2525200843811035, 0.24639606475830078, 0.17734909057617188, 0.1612389087677002, 0.15805602073669434, 0.1435999870300293, 0.1475820541381836, 0.1337909698486328, 0.14001893997192383]
plt.plot(number_of_batches, runtimes)
plt.xlabel("Number of batches")
plt.ylabel("Total run time (s)")
plt.title("Run time versus number of batches for GPU implementation");
We can compare this to the serial implementation time.
from time import time
Echo = ESN_init(inSize, outSize, resSize, alpha, sparsity)
data_train, Y_train = fetchData(train_cycles)
start = time()
reservoir_slow(data_train, *Echo, powerset)
stop = time()
runtimesserial = stop - start
print(runtimesserial)
1.8172321319580078
And so we can plot the speedup again, this time remembering that we have fixed the amount of training data that remains after discarding the warmup. The result is similar; no surprises here.
plt.plot(number_of_batches, runtimesserial/np.array(runtimes))
plt.title("Multiplicative Speedup of Parallel Implementation")
plt.xlabel("Number of batches")
plt.ylabel("Serial time / GPU time");
In addition to the overhead from throwing away data, there is a ceiling imposed by the size of the M1 chip. It seems as if we have hit one of these limits, but we still outperform the serial implementation for every batch count. Now we're ready to use this simulator in the next notebook to see if we can observe any empirical scaling behavior for these networks in the presence of noise!