--- device/fft1d.cl	2018-02-12 17:54:56.000000000 +0000
+++ device/fft1d.cl	2018-07-23 20:57:12.414000000 +0000
@@ -46,7 +46,7 @@
 // Include source code for an engine that produces 8 points each step
 #include "fft_8.cl" 
 
-#pragma OPENCL EXTENSION cl_intel_channels : enable
+//#pragma OPENCL EXTENSION cl_intel_channels : enable
 
 #include "../host/inc/fft_config.h"
 
@@ -66,11 +66,19 @@
 #define CONT_FACTOR            (1 << LOG_CONT_FACTOR)
 
 // Need some depth to our channels to accomodate their bursty filling.
-channel float2 chanin[8] __attribute__((depth(CONT_FACTOR*8)));
+
+pipe float2 chanin0 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin1 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin2 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin3 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin4 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin5 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin6 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
+pipe float2 chanin7 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
 
 uint bit_reversed(uint x, uint bits) {
   uint y = 0;
-  #pragma unroll 
+ __attribute__((opencl_unroll_hint()))
   for (uint i = 0; i < bits; i++) {
     y <<= 1;
     y |= x & 1;
@@ -139,15 +147,15 @@
 }
 
 // group dimension (N/(8*CONT_FACTOR), num_iterations)
-__attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
-kernel void fetch (global float2 * restrict src) {
+kernel __attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
+void fetch (global float2 * restrict src) {
 
   const int N = (1 << LOGN);
   // Each thread will fetch POINTS points. Need POINTS times to pass to FFT.
   const int BUF_SIZE = 1 << (LOG_CONT_FACTOR + LOGPOINTS + LOGPOINTS);
 
   // Local memory for CONT_FACTOR * POINTS points
-  local float2 buf[BUF_SIZE];
+  __local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1)));
 
   uint iteration = get_global_id(1);
   uint group_per_iter = get_global_id(0);
@@ -158,17 +166,17 @@
   uint lid = get_local_id(0);
   uint local_addr = lid << LOGPOINTS;
 
-  #pragma unroll
-  for (uint k = 0; k < POINTS; k++) {
+  __attribute__((opencl_unroll_hint()))
+  for (uint k = 0; k < POINTS; k+=2) {
     buf[local_addr + k] = src[global_addr + k];
   }
 
   barrier (CLK_LOCAL_MEM_FENCE);
 
-  #pragma unroll
-  for (uint k = 0; k < POINTS; k++) {
-    uint buf_addr = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
-    write_channel_intel (chanin[k], buf[buf_addr]);
+  uint buf_addr[8];
+  __attribute__((opencl_unroll_hint()))
+  for (uint k = 0; k < 8; k++) {
+    buf_addr[k] = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
   }
 }
 
@@ -181,9 +189,24 @@
  * 'count' represents the number of 4k sets to process
  * 'inverse' toggles between the direct and the inverse transform
  */
+  // bit_reversed reverses the bit locations of the value given.
+  // The second parameter is the width of the number (in bits) to reverse.
+  // Only the non-symmetric numbers are changed.  E.g. 001,011,100,110 -> 100,110,100,110
+
+
+
+    write_pipe(chanin0, &buf[buf_addr[0]]);
+    write_pipe(chanin1, &buf[buf_addr[1]]);
+    write_pipe(chanin2, &buf[buf_addr[2]]);
+    write_pipe(chanin3, &buf[buf_addr[3]]);
+    write_pipe(chanin4, &buf[buf_addr[4]]);
+    write_pipe(chanin5, &buf[buf_addr[5]]);
+    write_pipe(chanin6, &buf[buf_addr[6]]);
+    write_pipe(chanin7, &buf[buf_addr[7]]);
+
 
-__attribute((task))
-kernel void fft1d(global float2 * restrict dest,
+kernel __attribute((reqd_work_group_size(1, 1, 1))) //task))
+void fft1d(global float2 * restrict dest,
                   int count, int inverse) {
 
   const int N = (1 << LOGN);
@@ -220,14 +243,14 @@
     float2x8 data;
     // Perform memory transfers only when reading data in range
     if (i < count * (N / 8)) {
-      data.i0 = read_channel_intel(chanin[0]);
-      data.i1 = read_channel_intel(chanin[1]);
-      data.i2 = read_channel_intel(chanin[2]);
-      data.i3 = read_channel_intel(chanin[3]);
-      data.i4 = read_channel_intel(chanin[4]);
-      data.i5 = read_channel_intel(chanin[5]);
-      data.i6 = read_channel_intel(chanin[6]);
-      data.i7 = read_channel_intel(chanin[7]);
+      read_pipe(chanin0,&data.i0);
+      read_pipe(chanin1,&data.i1);
+      read_pipe(chanin2,&data.i2);
+      read_pipe(chanin3,&data.i3);
+      read_pipe(chanin4,&data.i4);
+      read_pipe(chanin5,&data.i5);
+      read_pipe(chanin6,&data.i6);
+      read_pipe(chanin7,&data.i7);
     } else {
       data.i0 = data.i1 = data.i2 = data.i3 = 
                 data.i4 = data.i5 = data.i6 = data.i7 = 0;