diff --git a/Kernel/Applications/Makefile.am b/Kernel/Applications/Makefile.am
index 3e39fb8c..bea9e349 100644
--- a/Kernel/Applications/Makefile.am
+++ b/Kernel/Applications/Makefile.am
@@ -1,11 +1,12 @@
 
-bin_PROGRAMS = load_bits
+bin_PROGRAMS = load_bits digihdr
 
 check_PROGRAMS = test_Input test_Unpack
 
 load_bits_SOURCES = load_bits.C
 test_Input_SOURCES = test_Input.C
 test_Unpack_SOURCES = test_Unpack.C
+digihdr_SOURCES = digihdr.C
 
 if HAVE_sigproc
   check_PROGRAMS += sigproc_header
diff --git a/Kernel/Applications/digihdr.C b/Kernel/Applications/digihdr.C
index e69f76f8..0517c411 100644
--- a/Kernel/Applications/digihdr.C
+++ b/Kernel/Applications/digihdr.C
@@ -6,6 +6,7 @@
  ***************************************************************************/
 
 #include <iostream>
+#include <unistd.h>
 
 #include "dsp/File.h"
 #include "TextInterface.h"
diff --git a/Kernel/Classes/BitUnpacker.C b/Kernel/Classes/BitUnpacker.C
index fda7a788..df3b1dcc 100644
--- a/Kernel/Classes/BitUnpacker.C
+++ b/Kernel/Classes/BitUnpacker.C
@@ -59,7 +59,7 @@ void dsp::BitUnpacker::unpack ()
 
   // Step through the array in small block sizes so that the matrix
   // transpose (for nchan>1 case) remains cache-friendly.
-  const unsigned blockdat = npol*nchan*ndim > 32 ? npol*nchan*ndim : 32;
+  const unsigned blockdat = npol*nchan*ndim*8/nbit > 32 ? npol*nchan*ndim*8/nbit : 32;
   const unsigned blockbytes = blockdat*nbit/8;
 
   const unsigned char* iptr = input->get_rawptr();
@@ -67,25 +67,22 @@ void dsp::BitUnpacker::unpack ()
   for (uint64_t idat=0; idat<ndat; idat+=blockdat, iptr+=blockbytes)
   {
     unsigned offset = 0;
-    const unsigned ndatblock = (blockdat>(ndat-idat)) ? ndat-idat : blockdat;
+    const unsigned ndatblock = (blockdat>(ndat-idat)) ? ndat-idat : blockdat/nskip;
     for (unsigned ichan=0; ichan<nchan; ichan++)
     {
       for (unsigned ipol=0; ipol<npol; ipol++)
       {
         for (unsigned idim=0; idim<ndim; idim++)
         {
-          //offset = idim + ipol*ndim + ichan*npol*ndim;
-          //const unsigned char* from = input->get_rawptr() + offset;
           const unsigned char* from = iptr + offset;
           float* into = output->get_datptr (ichan, ipol) + ndim*idat + idim;
           unsigned long* hist = get_histogram (offset);
-   
+
 #ifdef _DEBUG
           cerr << "c=" << ichan << " p=" << ipol << " d=" << idim << endl;
 #endif
  
           unpack (ndatblock, from, nskip, into, fskip, hist);
-          //unpack (ndat, from, nskip, into, fskip, hist);
           offset ++;
         }
       }
diff --git a/Kernel/Classes/DummyFile.C b/Kernel/Classes/DummyFile.C
index db7367e1..1dd22ba5 100644
--- a/Kernel/Classes/DummyFile.C
+++ b/Kernel/Classes/DummyFile.C
@@ -56,6 +56,12 @@ void dsp::DummyFile::open_file (const char* filename)
   
   // Read obs info from ASCII file
   info = new ASCIIObservation(header);
+  
+  // cannot load less than a byte. set the time sample resolution accordingly
+  unsigned bits_per_byte = 8;
+  resolution = bits_per_byte / get_info()->get_nbit();
+  if (resolution == 0)
+    resolution = 1;
 }
 
 void dsp::DummyFile::close ()
diff --git a/Kernel/Classes/MultiFile.C b/Kernel/Classes/MultiFile.C
index 45071354..cea5e115 100644
--- a/Kernel/Classes/MultiFile.C
+++ b/Kernel/Classes/MultiFile.C
@@ -17,7 +17,7 @@
 
 using namespace std;
 
-dsp::MultiFile::MultiFile () : File ("MultiFile")
+dsp::MultiFile::MultiFile (const char* name) : File (name)
 {
   test_contiguity = true;
   current_index = 0;
diff --git a/Kernel/Classes/Multiplex.C b/Kernel/Classes/Multiplex.C
index ea694cf2..7b777377 100644
--- a/Kernel/Classes/Multiplex.C
+++ b/Kernel/Classes/Multiplex.C
@@ -1,7 +1,7 @@
 //-*-C++-*-
 /***************************************************************************
  *
- *   Copyright (C) 2009
+ *   Copyright (C) 2009 by Jonathon Kocz
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
@@ -18,7 +18,7 @@
 
 using namespace std;
 
-dsp::Multiplex::Multiplex () : MultiFile ()
+dsp::Multiplex::Multiplex () : MultiFile ( "Multiplex" )
 {
   //current_index = 0;
 }
diff --git a/Kernel/Classes/ObservationInterface.C b/Kernel/Classes/ObservationInterface.C
index c9edb6bf..dc2e6b0d 100644
--- a/Kernel/Classes/ObservationInterface.C
+++ b/Kernel/Classes/ObservationInterface.C
@@ -13,6 +13,7 @@ dsp::Observation::Interface::Interface( Observation *c )
   add( &Observation::get_nchan, "nchan", "Number of frequency channels" );
   add( &Observation::get_npol,  "npol",  "Number of polarizations" );
   add( &Observation::get_ndim,  "ndim",  "Number of data dimensions" );
+  add( &Observation::get_nbit,  "nbit",  "Number of bits per datum" );
 
   add( &Observation::get_type,
        &Observation::set_type,
diff --git a/Kernel/Classes/dsp/MultiFile.h b/Kernel/Classes/dsp/MultiFile.h
index 97809f31..a3667a66 100644
--- a/Kernel/Classes/dsp/MultiFile.h
+++ b/Kernel/Classes/dsp/MultiFile.h
@@ -27,7 +27,7 @@ namespace dsp {
   public:
   
     //! Constructor
-    MultiFile ();
+    MultiFile (const char* name = "MultiFile");
     
     //! Destructor
     virtual ~MultiFile ();
diff --git a/Kernel/Classes/dsp/Multiplex.h b/Kernel/Classes/dsp/Multiplex.h
index 285565ef..f0a89e9f 100644
--- a/Kernel/Classes/dsp/Multiplex.h
+++ b/Kernel/Classes/dsp/Multiplex.h
@@ -1,16 +1,12 @@
 //-*-C++-*-
 /***************************************************************************
  *
- *   Copyright (C) 2002 by Willem van Straten
+ *   Copyright (C) 2002 by Jonathon Kocz
  *   Licensed under the Academic Free License version 2.1
  *
  ***************************************************************************/
 
-/* $Source: /cvsroot/dspsr/dspsr/Kernel/Classes/dsp/Multiplex.h,v $
-   $Revision: 1.1 $
-   $Date: 2009/09/10 01:02:34 $
-   $Author: tcaotiaafoc $ */
-
+// dspsr/Kernel/Classes/dsp/Multiplex.h
 
 #ifndef __Multiplex_h
 #define __Multiplex_h
diff --git a/Kernel/Formats/Unpacker_registry.C b/Kernel/Formats/Unpacker_registry.C
index 9f7fa3e4..d0d3d921 100644
--- a/Kernel/Formats/Unpacker_registry.C
+++ b/Kernel/Formats/Unpacker_registry.C
@@ -102,6 +102,11 @@ static dsp::Unpacker::Register::Enter<dsp::GUPPIFourBit> guppi4;
 static dsp::Unpacker::Register::Enter<dsp::GUPPITwoBitCorrection> guppi2;
 #endif
 
+#if HAVE_kat
+#include "dsp/KAT7Unpacker.h"
+static dsp::Unpacker::Register::Enter<dsp::KAT7Unpacker> kat7;
+#endif
+
 #if HAVE_lofar_dal
 #include "dsp/LOFAR_DALUnpacker.h"
 static dsp::Unpacker::Register::Enter<dsp::LOFAR_DALUnpacker> lofar_dal;
diff --git a/Kernel/Formats/fits/FITSFile.C b/Kernel/Formats/fits/FITSFile.C
index 66f59f43..10b690bc 100644
--- a/Kernel/Formats/fits/FITSFile.C
+++ b/Kernel/Formats/fits/FITSFile.C
@@ -154,6 +154,8 @@ void dsp::FITSFile::open_file(const char* filename)
   std::string backend_name = archive->get<Pulsar::Backend>()->get_name();
   if (backend_name == "GUPPI" || backend_name == "PUPPI")
     get_info()->set_machine("GUPPIFITS");
+  else if (backend_name == "COBALT")
+    get_info()->set_machine("COBALT");
   else
     get_info()->set_machine("FITS");
   get_info()->set_telescope(archive->get_telescope());
diff --git a/Kernel/Formats/fits/Makefile.am b/Kernel/Formats/fits/Makefile.am
index bb2e3cb9..f1eb1505 100644
--- a/Kernel/Formats/fits/Makefile.am
+++ b/Kernel/Formats/fits/Makefile.am
@@ -1,12 +1,12 @@
 noinst_LTLIBRARIES = libfits.la
 
 nobase_include_HEADERS = dsp/FITSUnpacker.h dsp/FITSFile.h \
-			 dsp/GUPPIFITSUnpacker.h dsp/FITSDigitizer.h \
-             dsp/FITSOutputFile.h
+			 dsp/GUPPIFITSUnpacker.h dsp/FITSDigitizer.h 
+             #dsp/FITSOutputFile.h
 
 libfits_la_SOURCES = FITSUnpacker.C GUPPIFITSUnpacker.C \
-		     FITSFile.C fits_params.h FITSDigitizer.C\
-             FITSOutputFile.C
+		     FITSFile.C fits_params.h FITSDigitizer.C
+             #FITSOutputFile.C
 
 #############################################################################
 #
diff --git a/Kernel/Formats/gmrt/GMRTFile.C b/Kernel/Formats/gmrt/GMRTFile.C
index fcf1d27f..d3c202b6 100644
--- a/Kernel/Formats/gmrt/GMRTFile.C
+++ b/Kernel/Formats/gmrt/GMRTFile.C
@@ -133,9 +133,11 @@ void dsp::GMRTFile::open_file (const char* filename)
   
   fd = open_read_header (filename, &header, &block);
  
-  cerr << "n_ds = " << header.n_ds << endl;
-  cerr << "n_chan = " << header.n_chan << endl;
-  cerr << "i_chan = " << block.FreqChanNo << endl;
+  if (verbose) {
+   cerr << "n_ds = " << header.n_ds << endl;
+   cerr << "n_chan = " << header.n_chan << endl;
+   cerr << "i_chan = " << block.FreqChanNo << endl;
+  }
 
   get_info()->set_nbit (8);
 
@@ -143,8 +145,10 @@ void dsp::GMRTFile::open_file (const char* filename)
   get_info()->set_bandwidth (bw);
   get_info()->set_centre_frequency (header.rf + (block.FreqChanNo + 0.5) * bw);
  
-  cerr << "cf = " << get_info()->get_centre_frequency() << endl;
-  cerr << "bw = " << bw << endl;
+  if (verbose) {
+   cerr << "cf = " << get_info()->get_centre_frequency() << endl;
+   cerr << "bw = " << bw << endl;
+  }
 
   get_info()->set_npol(2);
   get_info()->set_state (Signal::Analytic);
@@ -154,18 +158,22 @@ void dsp::GMRTFile::open_file (const char* filename)
   epoch += block.ipts1 / get_info()->get_rate();
 
   get_info()->set_start_time( epoch );
-  cerr << "MJD = " << get_info()->get_start_time() << endl;
-  cerr << "telescope = " << header.telescope << endl;
+  if (verbose) {
+   cerr << "MJD = " << get_info()->get_start_time() << endl;
+   cerr << "telescope = " << header.telescope << endl;
+  }
   get_info()->set_telescope (header.telescope);
 
   get_info()->set_source (header.psr_name);
 
   header_bytes = sizeof(struct gmrt_params);
 
-  cerr << "totalsize=" << block.totalsize << endl;
-  cerr << "NPtsSend=" << block.NPtsSend << endl;
-  cerr << "overlap=" << header.overlap << endl;
-  cerr << "n_samp_dump=" << header.n_samp_dump << endl;
+  if (verbose) {
+   cerr << "totalsize=" << block.totalsize << endl;
+   cerr << "NPtsSend=" << block.NPtsSend << endl;
+   cerr << "overlap=" << header.overlap << endl;
+   cerr << "n_samp_dump=" << header.n_samp_dump << endl;
+  }
 
   block_header_bytes = sizeof(struct data2rcv);
   //block_tailer_bytes = header.overlap * 4;
diff --git a/Kernel/Formats/kat/KAT7Unpacker.C b/Kernel/Formats/kat/KAT7Unpacker.C
new file mode 100644
index 00000000..5a7ac12a
--- /dev/null
+++ b/Kernel/Formats/kat/KAT7Unpacker.C
@@ -0,0 +1,179 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2015 Andrew Jameson
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "dsp/KAT7Unpacker.h"
+#include "dsp/BitTable.h"
+
+#include "Error.h"
+
+#if HAVE_CUDA
+#include "dsp/MemoryCUDA.h"
+#include "dsp/KAT7UnpackerCUDA.h"
+#include <cuda_runtime.h>
+#endif
+
+#include <errno.h>
+
+using namespace std;
+
+static void* const undefined_stream = (void *) -1;
+
+dsp::KAT7Unpacker::KAT7Unpacker (const char* _name) : HistUnpacker (_name)
+{
+  if (verbose)
+    cerr << "dsp::KAT7Unpacker ctor" << endl;
+
+  set_nstate (256);
+  gpu_stream = undefined_stream;
+
+  table = new BitTable (8, BitTable::TwosComplement);
+
+  device_prepared = false;
+}
+
+dsp::KAT7Unpacker::~KAT7Unpacker ()
+{
+}
+
+dsp::KAT7Unpacker * dsp::KAT7Unpacker::clone () const
+{
+  return new KAT7Unpacker (*this);
+}
+
+//! Return true if the unpacker can operate on the specified device
+bool dsp::KAT7Unpacker::get_device_supported (Memory* memory) const
+{
+#if HAVE_CUDA
+  if (verbose)
+    cerr << "dsp::KAT7Unpacker::get_device_supported HAVE_CUDA" << endl;
+  return dynamic_cast< CUDA::DeviceMemory*> ( memory );
+#else
+  return false;
+#endif
+}
+
+//! Set the device on which the unpacker will operate
+void dsp::KAT7Unpacker::set_device (Memory* memory)
+{
+#if HAVE_CUDA
+  CUDA::DeviceMemory * gpu_mem = dynamic_cast< CUDA::DeviceMemory*>( memory );
+  if (gpu_mem)
+  {
+    gpu_stream = (void *) gpu_mem->get_stream();
+    if (verbose)
+      cerr << "dsp::KAT7Unpacker::set_device using gpu memory" << endl;
+    staging.set_memory( memory );
+  }
+  else
+  {
+    if (verbose)
+      cerr << "dsp::KAT7Unpacker::set_device using cpu memory" << endl;
+    gpu_stream = undefined_stream;
+  }
+#else
+  Unpacker::set_device (memory);
+#endif
+  device_prepared = true;
+}
+
+
+bool dsp::KAT7Unpacker::matches (const Observation* observation)
+{
+  return observation->get_machine() == "KPSR"
+    && observation->get_ndim() == 2
+    && observation->get_nbit() == 8;
+}
+
+void dsp::KAT7Unpacker::unpack ()
+{
+#if HAVE_CUDA
+  if (gpu_stream != undefined_stream)
+  {
+    unpack_on_gpu ();
+    return;
+  }
+#endif
+
+  // some programs (digifil) do not call set_device
+  if ( ! device_prepared )
+    set_device ( Memory::get_manager ());
+
+  const uint64_t ndat  = input->get_ndat();
+  const int8_t * from = (int8_t *) input->get_rawptr();
+  float * into;
+  const unsigned nchan = input->get_nchan();
+  const unsigned npol = 1;
+
+  const float* lookup = table->get_values ();
+
+  // data is stored as 128 sample blocks of FT ordered data
+  const uint64_t nblocks = ndat / 128;
+
+  // cerr << "dsp::KAT7Unpacker::unpack ndat="<<ndat << " nchan=" << nchan << " nblocks=" << nblocks << endl;
+
+  for (uint64_t iblock=0; iblock<nblocks; iblock++)
+  {
+    for (unsigned ichan=0; ichan<nchan; ichan++)
+    {
+      for (unsigned ipol=0; ipol<npol; ipol++)
+      {
+        into = output->get_datptr (ichan, ipol) + (iblock*256);
+	for (unsigned isamp=0; isamp<256; isamp++)
+        {
+          into[isamp] = (float) from[isamp];
+        }
+        from += 256;
+      }
+    }
+  }
+}
+
+unsigned dsp::KAT7Unpacker::get_resolution () const { return 128; }
+
+#if HAVE_CUDA
+
+void dsp::KAT7Unpacker::unpack_on_gpu ()
+{
+  const uint64_t ndat = input->get_ndat();
+  const unsigned nchan = input->get_nchan();
+  const unsigned ndim  = input->get_ndim();
+  const unsigned npol  = input->get_npol();
+
+  const uint64_t to_copy = ndat * nchan * ndim * npol;
+
+  staging.Observation::operator=( *input );
+  staging.resize(ndat);
+
+  // staging buffer on the GPU for packed data
+  unsigned char * d_staging = staging.get_rawptr();
+  const unsigned char* from = input->get_rawptr();
+  float * into = output->get_datptr(0,0);
+
+  cudaStream_t stream = (cudaStream_t) gpu_stream;
+  cudaError error;
+
+  if (stream)
+    error = cudaMemcpyAsync (d_staging, from, to_copy,
+                             cudaMemcpyHostToDevice, stream);
+  else
+    error = cudaMemcpy (d_staging, from, to_copy, cudaMemcpyHostToDevice);
+
+  if (error != cudaSuccess)
+    throw Error (FailedCall, "KAT7Unpacker::unpack_on_gpu",
+                 "cudaMemcpy%s %s", stream?"Async":"", 
+                 cudaGetErrorString (error));
+
+  kat7_unpack (stream, ndat, nchan, npol, table->get_scale(), (int16_t *) d_staging, into);
+}
+
+#endif
+
diff --git a/Kernel/Formats/kat/KAT7UnpackerCUDA.cu b/Kernel/Formats/kat/KAT7UnpackerCUDA.cu
new file mode 100644
index 00000000..5c3ffe88
--- /dev/null
+++ b/Kernel/Formats/kat/KAT7UnpackerCUDA.cu
@@ -0,0 +1,91 @@
+//-*-C++-*-
+/***************************************************************************
+ *
+ *   Copyright (C) 2010 by Willem van Straten
+ *   Licensed under the Academic Free License version 2.1
+ *
+ ***************************************************************************/
+
+#include "dsp/KAT7UnpackerCUDA.h"
+#include "dsp/Operation.h"
+
+#include "Error.h"
+
+#include <cuComplex.h>
+
+//#define _DEBUG 
+
+using namespace std;
+
+void check_error (const char*);
+
+// each thread unpacks samples so that 1 warp does 128 contiguous samples
+__global__ void kat7_unpack_fpt_kernel (const uint64_t ndat, float scale, const int16_t * input, cuFloatComplex * output)
+{
+  const int warp_idx = threadIdx.x & 0x1F;	// threadIDx.x % 32
+  const int warp_num = threadIdx.x / 32;
+
+  const unsigned ichan = blockIdx.y;
+  const unsigned nchan = gridDim.y;
+  const unsigned ichan_offset = (ichan * 128);
+
+  // first sample for the start of the warp
+  unsigned isamp = (blockIdx.x * blockDim.x * 4) + (warp_num * 128) + warp_idx;
+  unsigned idx = (blockIdx.x * blockDim.x * 4 * nchan) + (warp_num * nchan * 128) + ichan_offset + warp_idx;
+  unsigned odx = (ichan * ndat) + isamp;
+
+  int16_t val16;
+  int8_t * val8 = (int8_t *) &val16;
+  cuFloatComplex val64;
+
+  for (unsigned ival=0; ival<4; ival++)
+  {
+    if (isamp < ndat)
+    {
+      val16 = input[idx];
+      val64.x = ((float) val8[0] + 0.5) * scale;
+      val64.y = ((float) val8[1] + 0.5) * scale;
+      output[odx] = val64;
+
+      idx += 32;
+      odx += 32;
+      isamp += 32;
+    }
+  }
+
+}
+
+void kat7_unpack (cudaStream_t stream, const uint64_t ndat, unsigned nchan, unsigned npol,
+                  float scale, const int16_t * input, float * output)
+{
+  int nthread = 1024;
+
+  const unsigned ndat_per_block = 4 * nthread;
+
+  // each thread will unpack 4 time samples
+  dim3 blocks = dim3 (ndat / ndat_per_block, nchan);
+
+  if (ndat % ndat_per_block != 0)
+    blocks.x++;
+
+#ifdef _DEBUG
+  cerr << "kat7_unpack ndat=" << ndat << " scale=" << scale 
+       << " input=" << (void*) input << " nblock=(" << blocks.x << "," << blocks.y << ")"
+       << " nthread=" << nthread << endl;
+#endif
+
+  kat7_unpack_fpt_kernel<<<blocks,nthread,0,stream>>> (ndat, scale, input, (cuFloatComplex *) output);
+
+  // AJ's theory... 
+  // If there are no stream synchronises on the input then the CPU pinned memory load from the
+  // input class might be able to get ahead of a whole sequence of GPU operations, and even exceed
+  // one I/O loop. Therefore this should be a reuqirement to have a stream synchronize some time
+  // after the data are loaded from pinned memory to GPU ram and the next Input copy to pinned memory
+
+  if (dsp::Operation::record_time || dsp::Operation::verbose)
+    check_error ("kat7_unpack_fpt_kernel");
+
+  // put it here for now
+  cudaStreamSynchronize(stream);
+
+}
diff --git a/Kernel/Formats/kat/Makefile.am b/Kernel/Formats/kat/Makefile.am
new file mode 100644
index 00000000..5ec339b8
--- /dev/null
+++ b/Kernel/Formats/kat/Makefile.am
@@ -0,0 +1,21 @@
+
+noinst_LTLIBRARIES = libkat.la
+
+nobase_include_HEADERS =  dsp/KAT7Unpacker.h
+
+libkat_la_SOURCES = KAT7Unpacker.C
+
+if HAVE_CUDA
+
+nobase_include_HEADERS += dsp/KAT7UnpackerCUDA.h
+libkat_la_SOURCES += KAT7UnpackerCUDA.cu
+
+endif
+
+#############################################################################
+#
+
+include $(top_srcdir)/config/Makefile.include
+include $(top_srcdir)/config/Makefile.cuda
+
+AM_CPPFLAGS += @CUDA_CFLAGS@
diff --git a/Kernel/Formats/kat/dsp/KAT7Unpacker.h b/Kernel/Formats/kat/dsp/KAT7Unpacker.h
new file mode 100644
index 00000000..b13091cb
--- /dev/null
+++ b/Kernel/Formats/kat/dsp/KAT7Unpacker.h
@@ -0,0 +1,55 @@
+/*
+
+ */
+
+#ifndef __dsp_KAT7Unpacker_h
+#define __dsp_KAT7Unpacker_h
+
+#include "dsp/EightBitUnpacker.h"
+
+namespace dsp {
+  
+  class KAT7Unpacker : public HistUnpacker
+  {
+  public:
+
+    //! Constructor
+    KAT7Unpacker (const char* name = "KAT7Unpacker");
+    ~KAT7Unpacker ();
+
+    //! Cloner (calls new)
+    virtual KAT7Unpacker * clone () const;
+
+    //! Return true if the unpacker can operate on the specified device
+    bool get_device_supported (Memory*) const;
+
+    //! Set the device on which the unpacker will operate
+    void set_device (Memory*);
+
+  protected:
+    
+    Reference::To<BitTable> table;
+
+    //! Return true if we can convert the Observation
+    bool matches (const Observation* observation);
+
+    void unpack ();
+
+    void unpack (uint64_t ndat, const unsigned char* from, 
+		             float* into, const unsigned fskip,
+		             unsigned long* hist);
+
+    BitSeries staging;
+    void * gpu_stream;
+    void unpack_on_gpu ();
+
+    unsigned get_resolution ()const ;
+
+  private:
+
+    bool device_prepared;
+
+  };
+}
+
+#endif
diff --git a/Kernel/Formats/kat/dsp/KAT7UnpackerCUDA.h b/Kernel/Formats/kat/dsp/KAT7UnpackerCUDA.h
new file mode 100644
index 00000000..8e2a9169
--- /dev/null
+++ b/Kernel/Formats/kat/dsp/KAT7UnpackerCUDA.h
@@ -0,0 +1,14 @@
+/*
+
+ */
+
+#ifndef __dsp_KAT7UnpackerCUDA_h
+#define __dsp_KAT7UnpackerCUDA_h
+
+#include<stdint.h>
+#include<cuda_runtime.h>
+
+void kat7_unpack (cudaStream_t stream, const uint64_t ndat, unsigned nchan, unsigned npol,
+                  float scale, const int16_t * input, float * output);
+
+#endif
diff --git a/Kernel/Formats/lofar_dal/LOFAR_DALFile.C b/Kernel/Formats/lofar_dal/LOFAR_DALFile.C
index 78fd9180..4441f607 100644
--- a/Kernel/Formats/lofar_dal/LOFAR_DALFile.C
+++ b/Kernel/Formats/lofar_dal/LOFAR_DALFile.C
@@ -411,8 +411,6 @@ void dsp::LOFAR_DALFile::open_file (const char* filename)
     }
   }
 
-  get_info()->set_machine( "LOFAR" );
-
   // OPEN ALL FILES
 
   handle = new Handle;
diff --git a/Kernel/Formats/lofar_dal/LOFAR_DALUnpacker.C b/Kernel/Formats/lofar_dal/LOFAR_DALUnpacker.C
index 28068eef..18190087 100644
--- a/Kernel/Formats/lofar_dal/LOFAR_DALUnpacker.C
+++ b/Kernel/Formats/lofar_dal/LOFAR_DALUnpacker.C
@@ -27,7 +27,7 @@ bool dsp::LOFAR_DALUnpacker::matches (const Observation* observation)
 {
   return
     observation->get_nbit() == 32 && 
-    observation->get_machine() == "LOFAR";
+    observation->get_machine() == "COBALT";
 }
 
 //! Return true if the unpacker support the specified output order
diff --git a/Kernel/Formats/sigproc/SigProcDigitizer.C b/Kernel/Formats/sigproc/SigProcDigitizer.C
index b1c4e13e..db66f024 100644
--- a/Kernel/Formats/sigproc/SigProcDigitizer.C
+++ b/Kernel/Formats/sigproc/SigProcDigitizer.C
@@ -53,12 +53,15 @@ public:
   inline unsigned operator () (unsigned out_chan)
   {
     unsigned in_chan = out_chan;
-    if (flip_band)
-      in_chan = (nchan-in_chan-1);
+    //if (flip_band)
+    //  in_chan = (nchan-in_chan-1);
     if (input->get_nsub_swap() > 1) 
       in_chan = input->get_unswapped_ichan(out_chan);
     else if (swap_band)
       in_chan = (in_chan+half_chan)%nchan;
+    // moved from the start of the block 
+    if (flip_band)
+      in_chan = (nchan-in_chan-1);
     return in_chan;
   }
 };
diff --git a/Kernel/Formats/sigproc/SigProcObservation.C b/Kernel/Formats/sigproc/SigProcObservation.C
index b36d5738..409a68bf 100644
--- a/Kernel/Formats/sigproc/SigProcObservation.C
+++ b/Kernel/Formats/sigproc/SigProcObservation.C
@@ -86,6 +86,8 @@ static std::string get_sigproc_telescope_name (int _id)
       return "GMRT";
     case 8:
       return "Effelsberg";
+    case 11:
+      return "LOFAR";
     default:
       return "unknown";
       break;
@@ -119,6 +121,7 @@ static int get_sigproc_telescope_id (string name)
     else if (itoa == "GB") return 6;
     else if (itoa == "GM") return 7;
     else if (itoa == "EF") return 8;
+    else if (itoa == "LF") return 11;
     else return 0;
   }
   catch (Error &error)
@@ -129,6 +132,41 @@ static int get_sigproc_telescope_id (string name)
   return 0;
 }
 
+static std::string get_sigproc_machine_name (int _id)
+{
+  // Info from sigproc's aliases.c
+  switch (_id) {
+    case 0:
+      return "FAKE";
+    case 1:
+      return "PSPM";
+    case 2:
+      return "WAPP";
+    case 3:
+      return "AOFTM";
+    case 4:
+      return "BPP";
+    case 5:
+      return "OOTY";
+    case 6:
+      return "SCAMP";
+    case 7:
+      return "GMRTFB";
+    case 8:
+      return "PULSAR2000";
+    case 10:
+      return "ARTEMIS";
+    case 11:
+      return "COBALT";
+    default:
+      return "?????";
+      break;
+  }
+
+  return "?????";
+}
+
+
 void dsp::SigProcObservation::load_global ()
 {
   // set_receiver (buffer);
@@ -163,7 +201,8 @@ void dsp::SigProcObservation::load_global ()
   coord.dec().setDegMS (src_dej);
   set_coordinates (coord);
 
-  set_machine ("SigProc");
+  // set_machine ("SigProc");
+  set_machine ( get_sigproc_machine_name(machine_id) );
   set_telescope ( get_sigproc_telescope_name(telescope_id) );
 }
 
@@ -193,6 +232,7 @@ void dsp::SigProcObservation::unload_global ()
   */
   if(get_machine().compare("BPSR")==0)machine_id=10;
   else if(get_machine().compare("SCAMP")==0)machine_id=6;
+  else if(get_machine().compare("COBALT")==0)machine_id=11;
 
   // This is the 'rawfilename' parameter in the header.
   // inpfile is possibly uninitialized here so avoid setting
diff --git a/Kernel/Formats/sigproc/read_header.c b/Kernel/Formats/sigproc/read_header.c
index 28a3b53d..54cdbda8 100644
--- a/Kernel/Formats/sigproc/read_header.c
+++ b/Kernel/Formats/sigproc/read_header.c
@@ -17,9 +17,9 @@ void get_string(FILE *inputfile, int *nbytes, char string[]) /* includefile */
   int nchar;
   strcpy(string,"ERROR");
   fread(&nchar, sizeof(int), 1, inputfile);
+  *nbytes=sizeof(int);
   if (feof(inputfile)) exit(0);
   if (nchar>80 || nchar<1) return;
-  *nbytes=sizeof(int);
   fread(string, nchar, 1, inputfile);
   string[nchar]='\0';
   *nbytes+=nchar;
diff --git a/Signal/General/Detection.C b/Signal/General/Detection.C
index f6849c26..9e79e02a 100644
--- a/Signal/General/Detection.C
+++ b/Signal/General/Detection.C
@@ -189,6 +189,7 @@ void dsp::Detection::resize_output ()
     get_output()->set_npol( output_npol );
     get_output()->set_ndim( output_ndim );
     get_output()->resize( get_input()->get_ndat() );
+    get_output()->set_zeroed_data (input->get_zeroed_data());
   }
   else if (reshape)
   {
@@ -222,9 +223,12 @@ void dsp::Detection::square_law ()
     cerr << "dsp::Detection::square_law" << endl;
 
   if (engine)
-    throw Error (InvalidState, "dsp::Detection::square_law",
-                 "square law detection not yet implemented for the GPU");
-
+  {
+    if (verbose)
+      cerr << "dsp::Detection::square_law using Engine engine=" << (void *) engine << endl;
+    engine->square_law (input, output);
+    return;
+  }
   const unsigned nchan = input->get_nchan();
   const unsigned npol = input->get_npol();
   
diff --git a/Signal/General/DetectionCUDA.cu b/Signal/General/DetectionCUDA.cu
index 3b305026..985484ed 100644
--- a/Signal/General/DetectionCUDA.cu
+++ b/Signal/General/DetectionCUDA.cu
@@ -22,6 +22,7 @@
 using namespace std;
 
 void check_error (const char*);
+void check_error_stream (const char*, cudaStream_t);
 
 /*
   PP   = p^* p
@@ -172,3 +173,116 @@ void CUDA::DetectionEngine::polarimetry (unsigned ndim,
     check_error ("CUDA::DetectionEngine::polarimetry");
 }
 
+// dubiuous about the correctness here... TODO AJ
+__global__ void sqld_tfp (float2 *base_in, unsigned stride_in,
+                          float * base_out, unsigned stride_out, unsigned ndat)
+{
+  // input and output pointers for channel (y dim)
+  float2 * in = base_in + (blockIdx.y * stride_in);
+  float * out = base_out + (blockIdx.y * stride_out);
+
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  out[i] = in[i].x * in[i].x + in[i].y * in[i].y;
+}
+
+__global__ void sqld_fpt (float2 *base_in, float *base_out, uint64_t ndat)
+{
+  // set base pointer for ichan [blockIdx.y], input complex, output detected, npol 1
+  float2 * in = base_in + (blockIdx.y * ndat);
+  float * out = base_out + (blockIdx.y * ndat);
+
+  // the sample for the channel
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  out[i] = in[i].x * in[i].x + in[i].y * in[i].y;
+}
+
+void CUDA::DetectionEngine::square_law (const dsp::TimeSeries* input,
+           dsp::TimeSeries* output)
+{
+  uint64_t ndat  = input->get_ndat ();
+  unsigned nchan = input->get_nchan ();
+  unsigned ndim  = input->get_ndim();
+  unsigned npol  = input->get_npol();
+
+  if (ndim != 2)
+    throw Error (InvalidParam, "CUDA::DetectionEngine::square_law",
+     "cannot handle ndim=%u != 2", ndim);
+
+  if (npol != 1)
+    throw Error (InvalidParam, "CUDA::DetectionEngine::square_law",
+     "cannot handle npol=%u != 1", ndim);
+
+  if (input == output)
+    throw Error (InvalidParam, "CUDA::DetectionEngine::square_law"
+     "cannot handle in-place data");
+
+/*
+  if (input->get_order() == dsp::TimeSeries::OrderTFP)
+    cerr << "CUDA::DetectionEngine::square_law input->get_order=TFP" << endl;
+  if (output->get_order() == dsp::TimeSeries::OrderTFP)
+    cerr << "CUDA::DetectionEngine::square_law output->get_order=TFP" << endl;
+*/
+
+  switch (input->get_order())
+  {
+    case dsp::TimeSeries::OrderTFP:
+    {
+      dim3 threads (512);
+      dim3 blocks (ndat/threads.x, nchan);
+
+      if (ndat % threads.x)
+        blocks.x ++;
+
+      float2* base_in = (float2*) input->get_dattfp ();
+      float* base_out = output->get_dattfp();
+
+      unsigned stride_in = nchan * npol;
+      unsigned stride_out = nchan * npol;
+
+      if (dsp::Operation::verbose)
+        cerr << "CUDA::DetectionEngine::square_law sqld_tfp ndat=" << ndat
+             << " stride_in=" << stride_in << " stride_out=" << stride_out << endl;
+
+      sqld_tfp<<<blocks,threads,0,stream>>> (base_in, stride_in, base_out, stride_out, ndat);
+
+      if (dsp::Operation::record_time || dsp::Operation::verbose)
+        check_error_stream ("CUDA::DetectionEngine::square_law sqld_tfp", stream);
+
+      break;
+    }
+
+    case dsp::TimeSeries::OrderFPT:
+    {
+      dim3 threads (512);
+      dim3 blocks (ndat/threads.x, nchan);
+
+      if (ndat % threads.x)
+        blocks.x ++;
+
+      unsigned ichan = 0;
+      unsigned ipol = 0;
+      float2* base_in = (float2*) input->get_datptr(ichan, ipol);
+      float* base_out = output->get_datptr(ichan, ipol);
+
+      if (dsp::Operation::verbose)
+        cerr << "CUDA::DetectionEngine::square_law <<<sqld_fpt>>> "
+             << " base_in=" << (void *) base_in
+             << " base_out=" << (void *) base_out
+             << " ndat=" << ndat << endl;
+
+      sqld_fpt<<<blocks,threads,0,stream>>> (base_in, base_out, ndat);
+
+      if (dsp::Operation::record_time || dsp::Operation::verbose)
+        check_error_stream ("CUDA::DetectionEngine::square_law sqld_fpt", stream);
+
+      break;
+    }
+
+    default:
+    {
+      throw Error (InvalidState, "CUDA::DetectionEngine::square_law", "unrecognized order");
+    }
+  }
+}
diff --git a/Signal/General/Filterbank.C b/Signal/General/Filterbank.C
index d9935173..3e751af8 100644
--- a/Signal/General/Filterbank.C
+++ b/Signal/General/Filterbank.C
@@ -205,21 +205,6 @@ void dsp::Filterbank::make_preparations ()
                      "matrix convolution and input.npol != 2");
   }
 
-  if (passband)
-  {
-    if (response)
-      passband -> match (response);
-
-    unsigned passband_npol = input->get_npol();
-    if (matrix_convolution)
-      passband_npol = 4;
-
-    passband->resize (passband_npol, input->get_nchan(), n_fft, 1);
-
-    if (!response)
-      passband->match (input);
-  }
-
   if (has_buffering_policy())
   {
     if (verbose)
@@ -239,6 +224,22 @@ void dsp::Filterbank::make_preparations ()
     return;
   }
 
+  // the engine should delete the passband if it doesn't support this feature
+  if (passband)
+  {
+    if (response)
+      passband -> match (response);
+
+    unsigned passband_npol = input->get_npol();
+    if (matrix_convolution)
+      passband_npol = 4;
+
+    passband->resize (passband_npol, input->get_nchan(), n_fft, 1);
+
+    if (!response)
+      passband->match (input);
+  }
+
   using namespace FTransform;
 
   OptimalFFT* optimal = 0;
diff --git a/Signal/General/FilterbankCUDA.cu b/Signal/General/FilterbankCUDA.cu
index 4fab005c..9986105e 100644
--- a/Signal/General/FilterbankCUDA.cu
+++ b/Signal/General/FilterbankCUDA.cu
@@ -77,6 +77,9 @@ void CUDA::FilterbankEngine::setup (dsp::Filterbank* filterbank)
   float2** d_kernel_ptr = reinterpret_cast<float2**>(filterbank->get_d_kernel_gpu_ptr());
   d_kernel = *d_kernel_ptr;
   
+  // the CUDA engine does not maintain/compute the passband
+  filterbank->set_passband (NULL);
+  
   freq_res = filterbank->get_freq_res ();
   nchan_subband = filterbank->get_nchan_subband();
 
diff --git a/Signal/General/SingleThread.C b/Signal/General/SingleThread.C
index 6f74de32..6af57a36 100644
--- a/Signal/General/SingleThread.C
+++ b/Signal/General/SingleThread.C
@@ -284,16 +284,16 @@ void dsp::SingleThread::construct () try
 
       unpacked->set_memory (new CUDA::PinnedMemory);
 
-      TransferCUDA* transfer;
+      TransferCUDA* transfer = new TransferCUDA (stream);
       if (config->use_input_stream)
       {
+        if (Operation::verbose)
+          cerr << "SingleThread: setting input stream" << endl;
         // Create an event that signals the completion of the CUDA transfer
         cudaEventCreate( reinterpret_cast<cudaEvent_t*>(&input_event) );
-        transfer = new TransferCUDA (stream,
-          static_cast<cudaStream_t>(input_stream), static_cast<cudaEvent_t>(input_event));
+        transfer->set_input_stream(static_cast<cudaStream_t>(input_stream),
+                                   static_cast<cudaEvent_t>(input_event));
       }
-      else
-        transfer = new TransferCUDA (stream);
       transfer->set_kind( cudaMemcpyHostToDevice );
       transfer->set_input( unpacked );
 
diff --git a/Signal/General/TransferCUDA.C b/Signal/General/TransferCUDA.C
index c5de21c2..98023b68 100644
--- a/Signal/General/TransferCUDA.C
+++ b/Signal/General/TransferCUDA.C
@@ -19,17 +19,7 @@ dsp::TransferCUDA::TransferCUDA(cudaStream_t _stream)
   stream = _stream;
   input_stream = _stream;
   kind = cudaMemcpyHostToDevice;
-}
-
-//! Associate cudaEvent with the transfer
-dsp::TransferCUDA::TransferCUDA(cudaStream_t _stream,
-                                cudaStream_t _input_stream, cudaEvent_t _event)
-  : Transformation<TimeSeries,TimeSeries> ("CUDA::Transfer", outofplace)
-{
-  stream = _stream;
-  input_stream = _input_stream;
-  kind = cudaMemcpyHostToDevice;
-  event = _event;
+  event = 0;
 }
 
 //! Do stuff
@@ -62,7 +52,8 @@ void dsp::TransferCUDA::transformation ()
                              input->internal_get_size(),
                              kind,
                              input_stream);
-    cudaEventRecord(event, input_stream);
+    if (event)
+      cudaEventRecord(event, input_stream);
   }
   else
     error = cudaMemcpy (output->internal_get_buffer(),
@@ -89,3 +80,9 @@ void dsp::TransferCUDA::prepare ()
   output->internal_match( input );
   output->copy_configuration( input );
 }
+
+void dsp::TransferCUDA::set_input_stream (cudaStream_t _input_stream, cudaEvent_t _event)
+{
+  input_stream = _input_stream;
+  event = _event;
+}
diff --git a/Signal/General/dsp/Detection.h b/Signal/General/dsp/Detection.h
index 092b410b..c5d09e46 100644
--- a/Signal/General/dsp/Detection.h
+++ b/Signal/General/dsp/Detection.h
@@ -103,6 +103,9 @@ namespace dsp {
   public:
     virtual void polarimetry (unsigned ndim,
 			      const TimeSeries* in, TimeSeries* out) = 0;
+
+    virtual void square_law (const dsp::TimeSeries* input,
+            dsp::TimeSeries* output) = 0;
   }; 
 }
 
diff --git a/Signal/General/dsp/DetectionCUDA.h b/Signal/General/dsp/DetectionCUDA.h
index 8f1a53d1..d2d8894a 100644
--- a/Signal/General/dsp/DetectionCUDA.h
+++ b/Signal/General/dsp/DetectionCUDA.h
@@ -28,6 +28,8 @@ namespace CUDA
     void polarimetry (unsigned ndim,
                       const dsp::TimeSeries* in, dsp::TimeSeries* out);
 
+    void square_law (const dsp::TimeSeries* in, dsp::TimeSeries* out);
+
   protected:
     cudaStream_t stream;
 
diff --git a/Signal/General/dsp/TransferCUDA.h b/Signal/General/dsp/TransferCUDA.h
index bdc3506c..67f8a59b 100644
--- a/Signal/General/dsp/TransferCUDA.h
+++ b/Signal/General/dsp/TransferCUDA.h
@@ -23,13 +23,13 @@ namespace dsp {
     //! Default constructor - always out of place
     TransferCUDA(cudaStream_t _stream);
 
-    //! Constructor with input stream and completion event
-    TransferCUDA(cudaStream_t _stream, cudaStream_t _input_stream,
-                 cudaEvent_t _event);
-
     void set_kind (cudaMemcpyKind k) { kind = k; }
     void prepare ();
 
+    // If transferring all input in its own stream, need the stream and an event
+    // signaling transfer completion
+    void set_input_stream (cudaStream_t _input_stream, cudaEvent_t _event);
+
     Operation::Function get_function () const { return Operation::Structural; }
 
   protected:
diff --git a/Signal/Pulsar/FoldCUDA.cu b/Signal/Pulsar/FoldCUDA.cu
index 23d36628..342b8551 100644
--- a/Signal/Pulsar/FoldCUDA.cu
+++ b/Signal/Pulsar/FoldCUDA.cu
@@ -177,8 +177,11 @@ void CUDA::FoldEngine::send_binplan ()
   cudaError error;
 
   if (stream)
+  {
     error = cudaMemcpyAsync (d_bin, binplan, mem_size,
 			     cudaMemcpyHostToDevice, stream);
+    cudaStreamSynchronize(stream);
+  }
   else
     error = cudaMemcpy (d_bin, binplan, mem_size, cudaMemcpyHostToDevice);
 
diff --git a/Signal/Pulsar/LoadToFold1.C b/Signal/Pulsar/LoadToFold1.C
index a3e45bb1..4ac12c51 100644
--- a/Signal/Pulsar/LoadToFold1.C
+++ b/Signal/Pulsar/LoadToFold1.C
@@ -689,6 +689,30 @@ void dsp::LoadToFold::prepare_fold ()
   fold_prepared = true;
 }
 
+MJD dsp::LoadToFold::parse_epoch (const std::string& epoch_string)
+{
+  MJD epoch;
+
+  if (epoch_string == "start")
+  {
+    epoch = manager->get_info()->get_start_time();
+    epoch += manager->get_input()->tell_seconds();
+
+    if (Operation::verbose)
+      cerr << "dsp::LoadToFold::parse reference epoch=start_time=" 
+	   << epoch.printdays(13) << endl;
+  }
+  else if (!epoch_string.empty())
+  {
+    epoch = MJD( epoch_string );
+    if (Operation::verbose)
+      cerr << "dsp::LoadToFold::parse reference epoch="
+	   << epoch.printdays(13) << endl;
+  }
+
+  return epoch;
+}
+
 void dsp::LoadToFold::prepare ()
 {
   assert (fold.size() > 0);
@@ -759,24 +783,7 @@ void dsp::LoadToFold::prepare ()
       excision -> set_cutoff_sigma ( config->excision_cutoff );
   }
 
-  MJD reference_epoch;
-
-  if (config->reference_epoch == "start")
-  {
-    reference_epoch = manager->get_info()->get_start_time();
-    reference_epoch += manager->get_input()->tell_seconds();
-
-    if (Operation::verbose)
-      cerr << "dsp::LoadToFold::prepare reference epoch=start_time=" 
-	   << reference_epoch.printdays(13) << endl;
-  }
-  else if (!config->reference_epoch.empty())
-  {
-    reference_epoch = MJD( config->reference_epoch );
-    if (Operation::verbose)
-      cerr << "dsp::LoadToFold::prepare reference epoch="
-	   << reference_epoch.printdays(13) << endl;
-  }
+  MJD fold_reference_epoch = parse_epoch (config->reference_epoch);
 
   for (unsigned ifold=0; ifold < fold.size(); ifold++)
   {
@@ -791,7 +798,7 @@ void dsp::LoadToFold::prepare ()
       fold[ifold]->get_output()->set_extensions (extensions);
     }
 
-    fold[ifold]->set_reference_epoch (reference_epoch);
+    fold[ifold]->set_reference_epoch (fold_reference_epoch);
   }
 
   SingleThread::prepare ();
@@ -1064,6 +1071,9 @@ void dsp::LoadToFold::build_fold (Reference::To<Fold>& fold,
 
       if (config->minimum_integration_length > 0)
 	unloader->set_minimum_integration_length (config->minimum_integration_length);
+
+      MJD reference_epoch = parse_epoch (config->integration_reference_epoch);
+      subfold -> set_subint_reference_epoch( reference_epoch );
     }
     else
     {
@@ -1101,7 +1111,8 @@ catch (Error& error)
   throw error += "dsp::LoadToFold::build_fold";
 }
 
-void dsp::LoadToFold::configure_detection (Detection* detect, int noperations)
+void dsp::LoadToFold::configure_detection (Detection* detect,
+					   unsigned noperations)
 {
 #if HAVE_CUDA
   bool run_on_gpu = thread_id < config->get_cuda_ndevice()
diff --git a/Signal/Pulsar/TimeDivide.C b/Signal/Pulsar/TimeDivide.C
index 86285bde..1bc2a363 100644
--- a/Signal/Pulsar/TimeDivide.C
+++ b/Signal/Pulsar/TimeDivide.C
@@ -58,8 +58,15 @@ void dsp::TimeDivide::set_start_time (MJD _start_time)
   start_phase = Phase::zero;
   is_valid = false;
 
-  if( division_seconds && division_seconds == unsigned(division_seconds) &&
-      integer_division_seconds_boundaries)
+  if( reference_epoch != MJD::zero )
+  {
+    if (Operation::verbose)
+      cerr << "dsp::TimeDivide::set_start_time set to reference_epoch=" 
+           << reference_epoch.printall() << endl;
+
+    start_time = reference_epoch;
+  }
+  else if( division_seconds && division_seconds == unsigned(division_seconds) )
   {
     unsigned integer_seconds = unsigned(division_seconds);
     unsigned seconds = start_time.get_secs();
@@ -98,9 +105,11 @@ void dsp::TimeDivide::set_predictor (const Pulsar::Predictor* _poly)
     return;
 
   poly = _poly;
+  period = 0.0;
   division_seconds = 0;
 }
 
+
 //! Set the reference phase (phase of bin zero)
 void dsp::TimeDivide::set_reference_phase (double phase)
 {
@@ -455,10 +464,10 @@ void dsp::TimeDivide::set_boundaries (const MJD& input_start)
 
   // division length specified in turns
 
-  if (!poly)
+  if (!poly && period == 0.0)
     throw Error (InvalidState, "dsp::TimeDivide::set_boundaries",
 		 "division length specified in turns "
-		 "but no folding Pulsar::Predictor");
+		 "but no folding predictor or period");
 
   if (Operation::verbose)
     cerr << "dsp::TimeDivide::set_boundaries using polynomial:\n"
diff --git a/Signal/Pulsar/dsp/LoadToFold1.h b/Signal/Pulsar/dsp/LoadToFold1.h
index aaf33688..03d6b37c 100644
--- a/Signal/Pulsar/dsp/LoadToFold1.h
+++ b/Signal/Pulsar/dsp/LoadToFold1.h
@@ -150,7 +150,7 @@ namespace dsp {
     void build_fold (TimeSeries*);
     void build_fold (Reference::To<Fold>&, PhaseSeriesUnloader*);
     void configure_fold (unsigned ifold, TimeSeries* to_fold);
-    void configure_detection (Detection*, int);
+    void configure_detection (Detection*, unsigned);
 
     PhaseSeriesUnloader* get_unloader (unsigned ifold);
     size_t get_nfold ();
@@ -162,6 +162,8 @@ namespace dsp {
     //! Prepare the given Archiver
     void prepare_archiver (Archiver*);
 
+    //! Parse the epoch string into a reference epoch
+    MJD parse_epoch (const std::string&);
   };
 
 }
diff --git a/Signal/Pulsar/dsp/LoadToFoldConfig.h b/Signal/Pulsar/dsp/LoadToFoldConfig.h
index d105c559..4a81436a 100644
--- a/Signal/Pulsar/dsp/LoadToFoldConfig.h
+++ b/Signal/Pulsar/dsp/LoadToFoldConfig.h
@@ -151,6 +151,9 @@ namespace dsp {
     // length of sub-integrations in seconds
     double integration_length;
 
+    // reference epoch = start of first sub-integration
+    std::string integration_reference_epoch;
+
     // minimum sub-integration length written to disk
     double minimum_integration_length;
 
diff --git a/Signal/Pulsar/dsp/Subint.h b/Signal/Pulsar/dsp/Subint.h
index 1aac4873..4a7acc70 100644
--- a/Signal/Pulsar/dsp/Subint.h
+++ b/Signal/Pulsar/dsp/Subint.h
@@ -75,6 +75,14 @@ namespace dsp {
     //! Get the number of seconds to fold into each sub-integration
     double get_subint_seconds () const { return divider.get_seconds(); }
 
+    //! Set the start time of the first sub-integration
+    void set_subint_reference_epoch (const MJD& epoch)
+    { divider.set_reference_epoch (epoch); }
+
+    //! Get the start time of the first sub-integration
+    MJD get_subint_reference_epoch () const
+    { return divider.get_reference_epoch (); }
+
     //! Set the number of turns to fold into each sub-integration
     void set_subint_turns (unsigned subint_turns)
       { divider.set_turns (subint_turns); }
diff --git a/Signal/Pulsar/dsp/TimeDivide.h b/Signal/Pulsar/dsp/TimeDivide.h
index c42cfa65..03adbd68 100644
--- a/Signal/Pulsar/dsp/TimeDivide.h
+++ b/Signal/Pulsar/dsp/TimeDivide.h
@@ -49,6 +49,12 @@ namespace dsp {
     //! Get the number of seconds in each division
     double get_seconds () const { return division_seconds; }
 
+    //! Set the reference epoch (start time of first division)
+    void set_reference_epoch (const MJD& epoch) { reference_epoch = epoch; }
+
+    //! Set the reference epoch (start time of first division)
+    MJD get_reference_epoch () const { return reference_epoch; }
+
     //! Set the number of turns in each division
     void set_turns (double division_turns);
 
@@ -61,6 +67,12 @@ namespace dsp {
     //! Get the Pulsar::Predictor used to determine pulse phase
     const Pulsar::Predictor* get_predictor () const { return poly; }
 
+    //! Set the folding period used to determine pulse phase
+    void set_period (double);
+
+    //! Set the folding period used to determine pulse phase
+    double get_period () const { return period; }
+
     //! Set the reference phase (phase of bin zero)
     void set_reference_phase (double phase);
 
@@ -126,6 +138,9 @@ namespace dsp {
     //! Number of seconds in each division
     double division_seconds;
 
+    //! Reference epoch at start of the first division
+    MJD reference_epoch;
+
     //! Number of turns in each division
     double division_turns;
 
@@ -138,6 +153,9 @@ namespace dsp {
     //! Round division boundaries to integer numbers of division_seconds
     bool integer_division_seconds_boundaries;
 
+    //! The period used to determine pulse phase
+    double period;
+
     //! The Pulsar::Predictor used to determine pulse phase
     Reference::To<const Pulsar::Predictor> poly;
 
diff --git a/Signal/Pulsar/dspsr.C b/Signal/Pulsar/dspsr.C
index e0c8ac28..73f2d0b6 100644
--- a/Signal/Pulsar/dspsr.C
+++ b/Signal/Pulsar/dspsr.C
@@ -436,6 +436,9 @@ void parse_options (int argc, char** argv) try
   arg = menu.add (config->integration_length, 'L', "seconds");
   arg->set_help ("create integrations of specified duration");
 
+  arg = menu.add (config->integration_reference_epoch, "Lepoch", "MJD");
+  arg->set_help ("start time of first sub-integration (when -L is used)");
+
   arg = menu.add (config->minimum_integration_length, "Lmin", "seconds");
   arg->set_help ("minimum integration length output");
 
diff --git a/configure.in b/configure.in
index 32df2ca2..33c6ef51 100644
--- a/configure.in
+++ b/configure.in
@@ -164,6 +164,7 @@ AC_CONFIG_FILES([Makefile
         	Kernel/Formats/fits/Makefile
 		Kernel/Formats/gmrt/Makefile
 		Kernel/Formats/guppi/Makefile
+		Kernel/Formats/kat/Makefile
 		Kernel/Formats/lbadr/Makefile
 		Kernel/Formats/lbadr64/Makefile
 		Kernel/Formats/lofar_dal/Makefile