diff --git a/.gitignore b/.gitignore
index e44def3..d618499 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,3 @@ build/*
 *.cubin
 *.fatbin
 tests/*_tests
-data/*
diff --git a/src/fhp.cu b/src/fhp.cu
index 98f1860..01e1ee7 100644
--- a/src/fhp.cu
+++ b/src/fhp.cu
@@ -445,3 +445,141 @@ initialize_grid(u8* device_grid, u8* device_obstacle, double* probability,
     return;
 }
+__global__
+void
+evolve_non_local(u8* device_grid, curandState_t* randstate, int width, int height,
+                 int timesteps, double* device_channels, double* mx, double* my,
+                 double* ocpy, double* amx, double* amy, double* aoc)
+{
+    // Tile plus a one-cell halo; assumes blockDim == (default_bw, default_bh).
+    __shared__ u8 sdm[default_bh+2][default_bw+2];
+    // Block-wide accumulators for the coarse per-block averages.
+    __shared__ float spx;
+    __shared__ float spy;
+    __shared__ float soc;
+
+    const auto local_row = threadIdx.y+1;
+    const auto local_col = threadIdx.x+1;
+    const auto row = blockIdx.y * blockDim.y + threadIdx.y;
+    const auto col = blockIdx.x * blockDim.x + threadIdx.x;
+    // NB: this early return assumes width and height are multiples of the
+    // block dimensions; a partial block would desynchronise __syncthreads.
+    if (row >= height || col >= width)
+        return;
+    curandState_t localstate = randstate[row*width + col];
+
+    double px = 0, py = 0;
+    double ocp = 0;
+    // Zero the shared accumulators exactly once, before any atomicAdd.
+    if (local_row == 1 && local_col == 1)
+        spx = spy = soc = 0.0f;
+    __syncthreads();
+
+    for (int t = 0; t < timesteps; t++)
+    {
+        // 1. Load this cell into shared memory.
+        sdm[local_row][local_col] = device_grid[row*width + col];
+        // All these bool values could be stored in the same register.
+        // Is nvcc smart enough to do this when registers are scarce?
+        const auto ubound_y {local_row == blockDim.y};
+        const auto ubound_x {local_col == blockDim.x};
+        const auto lbound_y {local_row == 1};
+        const auto lbound_x {local_col == 1};
+        size_t pad_row = row, pad_col = col;
+        size_t local_pad_row = local_row, local_pad_col = local_col;
+
+        // Boundary threads also fetch the halo cells, wrapping periodically
+        // at the grid edges. Nsight is going to roast tf out of this kernel.
+        if (lbound_y)
+        {
+            pad_row = (row == 0) ? height-1 : row-1;
+            local_pad_row = 0;
+            sdm[local_pad_row][local_col] = device_grid[pad_row * width + col];
+        } else if (ubound_y)
+        {
+            pad_row = (row+1) % height;
+            local_pad_row = blockDim.y+1;
+            sdm[local_pad_row][local_col] = device_grid[pad_row * width + col];
+        }
+        if (lbound_x)
+        {
+            pad_col = (col == 0) ? width-1 : col-1;
+            local_pad_col = 0;
+            sdm[local_row][local_pad_col] = device_grid[row * width + pad_col];
+        } else if (ubound_x)
+        {
+            pad_col = (col+1) % width;
+            local_pad_col = blockDim.x+1;
+            sdm[local_row][local_pad_col] = device_grid[row * width + pad_col];
+        }
+
+        // Corner threads fetch the diagonal halo cell as well.
+        if (pad_row != row && pad_col != col)
+        {
+            sdm[local_pad_row][local_pad_col] = device_grid[pad_row*width + pad_col];
+        }
+        __syncthreads();
+
+        // 2. Streaming
+        u8 state = stream(local_row, local_col, sdm);
+        __syncthreads();
+
+        // 3. Collision
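+        // The lookup below replaces `state` with a uniformly random member
+        // of its equivalence class: d_state_to_eq[state] is the class's base
+        // offset into d_eq_classes and d_eq_class_size[state] its size, so
+        // (in the usual FHP construction, which these tables presumably
+        // encode) every candidate conserves the cell's particle count and
+        // net momentum. curand_uniform returns a float in (0, 1], hence the
+        // (size - eps) scaling to keep the truncated index in [0, size-1];
+        // the % size is only a guard against rounding.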
+        u8 size = d_eq_class_size[state];
+        u8 base_index = d_state_to_eq[state];
+
+        float rand = curand_uniform(&localstate);
+        rand *= size - 0.00001f;
+        u8 random_index = (u8)(rand) % size; // requires curand_init per thread
+
+        state = d_eq_classes[base_index + random_index];
+
+        device_grid[row*width + col] = state;
+
+        // Accumulate this step's momentum and occupancy: privately for the
+        // per-cell averages, atomically for the per-block averages.
+        const double step_px = momentum_x(state, device_channels);
+        const double step_py = momentum_y(state, device_channels);
+        const double step_oc = occupancy(state);
+        px  += step_px;
+        py  += step_py;
+        ocp += step_oc;
+        atomicAdd(&spx, (float)step_px);
+        atomicAdd(&spy, (float)step_py);
+        atomicAdd(&soc, (float)step_oc);
+        __syncthreads();
+    }
+
+    // Hand the advanced RNG state back for any later launch.
+    randstate[row*width + col] = localstate;
+
+    // Per-cell time averages.
+    mx[row*width + col] = px / timesteps;
+    my[row*width + col] = py / timesteps;
+    ocpy[row*width + col] = ocp / timesteps;
+
+    // Per-block averages: one thread per block writes its slot, laid out
+    // row-major over the launch grid.
+    if (local_col == 1 && local_row == 1)
+    {
+        const auto br = blockIdx.y;
+        const auto bc = blockIdx.x;
+        const auto addr = br*gridDim.x + bc;
+        const float cells = blockDim.x * blockDim.y;
+        amx[addr] = spx / (cells * timesteps);
+        amy[addr] = spy / (cells * timesteps);
+        aoc[addr] = soc / (cells * timesteps);
+    }
+    return;
+}
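evolve_non_local consumes randstate[row*width + col] and writes the advanced state back, but nothing in this diff seeds those states; the collision step's comment notes that curand_init must have run for every thread. A minimal sketch of such a setup kernel, assuming the name init_randstate and a one-subsequence-per-cell seeding policy (both illustrative, not this repo's actual code):

#include <curand_kernel.h>

__global__ void
init_randstate(curandState_t *state, int width, int height,
               unsigned long long seed)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= height || col >= width)
        return;
    // Same seed, distinct subsequence per cell: the streams are
    // statistically independent, and a fixed seed keeps runs reproducible.
    curand_init(seed, row * width + col, 0, &state[row * width + col]);
}

Launched once, before the evolution kernel, with the same configuration, e.g. init_randstate<<<grid_config, block_config>>>(fhp.state, width, height, 1234ULL);
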
diff --git a/src/fhp.hpp b/src/fhp.hpp
index 08b5cd2..fc084d6 100644
--- a/src/fhp.hpp
+++ b/src/fhp.hpp
@@ -234,6 +234,19 @@ struct fhp_grid
         }
         stream << "\n";
     }
+    void
+    create_csv(std::ofstream &stream, double *buf, const int& w, const int& h)
+    {
+        for(int i=0; i<h; i++)
+        {
+            for(int j=0; j<w; j++)
+            {
+                stream << buf[i*w + j] << ",";
+            }
+            stream << "\n";
+        }
+    }
diff --git a/src/main.cu b/src/main.cu
--- a/src/main.cu
+++ b/src/main.cu
@@ … @@
 std::vector<double> base_velocity_vec { 1.0, 0.0 };
+void
+aligned_rect(int width, int height, u8* buffer)
+{
+    const auto centre_x = width / 2;
+    const auto centre_y = height / 2;
+
+    for(int i=0; i<height; i++)
…
-    initialize_cylindrical_obstacle(buffer, width, height, centre_x, centre_y, radius);
+    // initialize_cylindrical_obstacle(buffer, width, height, centre_x, centre_y, radius);
+    true_rect(width, height, buffer);
+    // aligned_rect(width, height, buffer);
     // channel-wise occupancy probabilities for initialization
-    double h_prob[] = { 0.8, 0.7, 0.4, 0.1, 0.4, 0.7 };
+    double h_prob[] = { 0.7, 0.5, 0.2, 0.1, 0.2, 0.5 };
     const dim3 block_config(8, 8);
     const dim3 grid_config = make_tiles(block_config, width, height);
@@ -75,9 +129,15 @@ int main(int argc, char *argv[])
     fhp.create_csv(ipy, my);
     fhp.create_csv(iocpy, occup);
 
+    double *avx, *avy, *aoc;
+    cudaMalloc((void **) &avx, grid_config.x*grid_config.y*sizeof(double));
+    cudaMalloc((void **) &avy, grid_config.x*grid_config.y*sizeof(double));
+    cudaMalloc((void **) &aoc, grid_config.x*grid_config.y*sizeof(double));
+
     // time evolution
-    evolve<<<grid_config, block_config>>>(fhp.device_grid, fhp.state, fhp.width, fhp.height,
-        timesteps, fhp.device_channels, fhp.mx, fhp.my, fhp.ocpy);
+    evolve_non_local<<<grid_config, block_config>>>(fhp.device_grid, fhp.state, fhp.width,
+        fhp.height, timesteps, fhp.device_channels, fhp.mx, fhp.my, fhp.ocpy,
+        avx, avy, aoc);
     cudaDeviceSynchronize();
     gpuErrchk(cudaGetLastError());
@@ -94,7 +154,45 @@ int main(int argc, char *argv[])
     fhp.create_csv(py, my);
     fhp.create_csv(ocpy, occup);
 
+    std::ofstream apx("data/avx.csv"), apy("data/avy.csv"), aocp("data/aocpy.csv");
+
+    double* hpx = new double[grid_config.x*grid_config.y];
+    double* hpy = new double[grid_config.x*grid_config.y];
+    double* hoc = new double[grid_config.x*grid_config.y];
+
+    cudaMemcpy(hpx, avx, grid_config.x*grid_config.y*sizeof(double),
+               cudaMemcpyDeviceToHost);
+    cudaMemcpy(hpy, avy, grid_config.x*grid_config.y*sizeof(double),
+               cudaMemcpyDeviceToHost);
+    cudaMemcpy(hoc, aoc, grid_config.x*grid_config.y*sizeof(double),
+               cudaMemcpyDeviceToHost);
+
+    fhp.create_csv(apx, hpx, grid_config.x, grid_config.y);
+    fhp.create_csv(apy, hpy, grid_config.x, grid_config.y);
+    fhp.create_csv(aocp, hoc, grid_config.x, grid_config.y);
+
+    momentum<<<grid_config, block_config>>>(fhp.device_grid, fhp.device_channels,
+        fhp.mx, fhp.my, fhp.ocpy, fhp.width);
+    cudaDeviceSynchronize();
+    gpuErrchk(cudaGetLastError());
+
+    std::ofstream fpx("data/fpx.csv"), fpy("data/fpy.csv"), focpy("data/foccupancy.csv");
+
+    fhp.get_output(buffer, mx, my, occup);
+
+    fhp.create_csv(fpx, mx);
+    fhp.create_csv(fpy, my);
+    fhp.create_csv(focpy, occup);
+
+    cudaFree(avx);
+    cudaFree(avy);
+    cudaFree(aoc);
+    delete[] hpx; delete[] hpy; delete[] hoc;
     delete[] occup;
     delete[] buffer;
     return 0;
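make_tiles is defined elsewhere in the repo; the sizing of avx/avy/aoc above only works if it performs ceiling division, so that every launched block owns exactly one slot. A plausible sketch under that assumption (not the repo's actual helper):

dim3 make_tiles(const dim3 &block, int width, int height)
{
    // One tile per block, rounding up so a non-divisible edge still
    // gets covered by a final partial tile.
    return dim3((width  + block.x - 1) / block.x,
                (height + block.y - 1) / block.y);
}

The kernel's addr = blockIdx.y*gridDim.x + blockIdx.x then matches the row-major layout of grid_config.y rows by grid_config.x columns that create_csv(apx, hpx, grid_config.x, grid_config.y) writes out.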