/*
 * zofits.h
 *
 *  FACT native compressed FITS writer
 *      Author: lyard
 */

#include "ofits.h"
#include "Queue.h"
#include "MemoryManager.h"

#ifndef __MARS__
namespace std
{
#else
using namespace std;
#endif

class zofits : public ofits
{
    public:

        //This has been duplicated from zfits. Should it be located one level up ?
        //If so, where ?
        //Identifiers of the processing steps that may appear in a column's
        //compression sequence (a vector<uint16_t>, see AddColumn).
        //NOTE(review): compressBuffer dispatches on the zfits:: enumerators of the
        //same names rather than on these - the two sets are assumed to stay identical.
        enum CompressionProcess_t
        {
            kFactRaw       = 0x0, //data copied as-is
            kFactSmoothing = 0x1, //smoothing pre-processing step
            kFactHuffman16 = 0x2  //Huffman encoding of 16-bit words
        };

        //Memory layout of a column inside a tile, stored as a single character.
        //NOTE(review): BlockHeader and copyTransposeTile use the zfits:: enumerators
        //of the same names - the values here are assumed to match them.
        enum RowOrdering_t
        {
            kOrderByCol = 'C', //tile data of a column is transposed (element by element)
            kOrderByRow = 'R'  //tile data of a column is kept row after row
        };

        //TileHeaders are only written, but never read-back
        //They are here to be able to recover raw data from binary if the header is corrupted
        //Or to cross-check the data, if desired: the zfits method CheckIfFileIsConsistent can do this
        //
        //Layout (packed, 16 bytes): 4-byte 'TILE' marker, number of rows in the tile,
        //then the size in bytes passed to the constructor.
        struct TileHeader
        {
          char     id[4];   //always 'T','I','L','E'
          uint32_t numRows; //number of rows stored in this tile
          uint64_t size;    //size in bytes associated with this tile

          //nRows: number of rows of the tile, s: size in bytes. Both default to 0.
          //The marker is brace-initialized: a parenthesized brace list is not a
          //valid initializer for an array member in standard C++.
          TileHeader(uint32_t nRows=0,
                     uint64_t s=0) : id{'T', 'I', 'L', 'E'},
                                     numRows(nRows),
                                     size(s)
          { }
        } __attribute__((__packed__));

        //A BlockHeader precedes every compressed blob of data.
        //Packed layout: 8-byte size, 1-byte ordering character, 1-byte number of
        //processing steps.
        struct BlockHeader
        {
            uint64_t      size;     //size in bytes of the block
            char          ordering; //row/column ordering character
            unsigned char numProcs; //number of processings applied to the block

            //All arguments are optional: an empty, row-ordered block with one processing.
            BlockHeader(uint64_t      blockSize=0,
                        char          blockOrdering=zfits::kOrderByRow,
                        unsigned char numProcessings=1)
                : size(blockSize)
                , ordering(blockOrdering)
                , numProcs(numProcessings)
            {
            }
        } __attribute__((__packed__)) ;


        //Describes one compressed tile waiting to be written to disk.
        struct WriteTarget
        {
            //Orders write targets by tile number.
            //The comparison result is now actually returned (the statement used to
            //be evaluated and discarded, leaving the return value undefined), and
            //the operator is const so it can be used on const objects.
            bool operator < (const WriteTarget& other) const
            {
                return tile_num < other.tile_num;
            }
            uint32_t tile_num;              //index of the tile in the file
            uint32_t size;                  //number of bytes to write
            shared_ptr<MemoryChunk> target; //memory chunk holding the data to write
        };

        //Describes one tile waiting to be compressed.
        struct CompressionTarget
        {
            //Orders compression targets by the tile number of their write target.
            //The tile numbers are compared directly so that the result does not
            //depend on WriteTarget's comparison operator, and the operator is const
            //so it can be used on const objects.
            bool operator < (const CompressionTarget& other) const
            {
                return target.tile_num < other.target.tile_num;
            }
            shared_ptr<MemoryChunk> src;      //memory chunk holding the data to compress
            WriteTarget             target;   //where the compressed data goes
            uint32_t                num_rows; //value of the table row counter when the tile was queued
        };


        //constructors
        //Creates a writer that is not attached to a file name.
        //numTiles and rowPerTile are forwarded to InitMemberVariables, maxUsableMem
        //is given both to the memory pool and to InitMemberVariables.
        //The write-to-disk queue is bound to WriteBufferToDisk and one compression
        //queue is set up via SetNumWorkingThreads(1).
        zofits(uint32_t numTiles=1000, uint32_t rowPerTile=100, uint64_t maxUsableMem=0)
            : ofits(),
              fMemPool(0, maxUsableMem),
              fWriteToDiskQueue(bind(&zofits::WriteBufferToDisk, this, placeholders::_1), true)
        {
            InitMemberVariables(numTiles, rowPerTile, maxUsableMem);
            SetNumWorkingThreads(1);
        }

        //Same as the default constructor, except that fname is handed to the
        //ofits base class constructor.
        //NOTE(review): SetNumWorkingThreads refuses to run when is_open() is true -
        //confirm whether ofits(fname) already opens the file at this point.
        zofits(const char* fname, uint32_t numTiles=1000, uint32_t rowPerTile=100, uint64_t maxUsableMem=0)
            : ofits(fname),
              fMemPool(0, maxUsableMem),
              fWriteToDiskQueue(bind(&zofits::WriteBufferToDisk, this, placeholders::_1), true)
        {
            InitMemberVariables(numTiles, rowPerTile, maxUsableMem);
            SetNumWorkingThreads(1);
        }

        //Destructor. The body is empty: nothing is released explicitly here.
        ~zofits()
        {
        }

        //Resets the bookkeeping members of the writer.
        //nt: number of tiles, rpt: number of rows per tile,
        //maxUsableMem: stored as-is in fMaxUsableMem.
        void InitMemberVariables(uint32_t nt=0, uint32_t rpt=0, uint64_t maxUsableMem=0)
        {
            //tiling and memory budget, as requested by the caller
            fNumTiles       = nt;
            fNumRowsPerTile = rpt;
            fMaxUsableMem   = maxUsableMem;

            //no compression queue selected yet
            fNumQueues   = 0;
            fQueueLooper = 0;

            //no row buffer allocated, no column added
            fBuffer       = NULL;
            fRealRowWidth = 0;

            //file offsets: catalog position unknown, calibration columns not located
            fCatalogOffset    =  0;
            fStartCellsOffset = -1;
            fDataOffset       = -1;

            //checksum alignment carried from one write to the next
            fCheckOffset = 0;
        }

        //Tells whether a drs offset calibration is currently loaded,
        //i.e. whether fOffsetCalibration holds at least one value.
        bool IsOffsetCalibrated()
        {
            return !fOffsetCalibration.empty();
        }

        //Loads a drs offset calibration from a raw array.
        //calib must point to at least 1440*1024 floats; each value is scaled by
        //4096/2000 and truncated to a 16-bit integer.
        void SetDrsCalibration(const float* calib)
        {
            //allocate the storage on first use only
            if (fOffsetCalibration.empty())
                fOffsetCalibration.resize(1440*1024);

            for (uint32_t idx=0; idx<1440*1024; ++idx)
            {
                const float scaled = calib[idx]*4096.f/2000.f;
                fOffsetCalibration[idx] = static_cast<int16_t>(scaled);
            }
        }

        //Loads a drs offset calibration from a vector.
        //calib must hold exactly 1440*1024 values. Otherwise a runtime_error is
        //thrown (exception builds) or an error is logged and the calibration is
        //left untouched (non-exception builds).
        void SetDrsCalibration(const vector<float>& calib)
        {
            if (calib.size() != 1440*1024)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Cannot load calibration with anything else than 1024 samples per pixel");
#else
                gLog << ___err___ << "ERROR - Cannot load calibration with anything else than 1024 samples per pixel" << endl;
                //do not fall through: reading 1440*1024 values from a vector of
                //another size would run past its end
                return;
#endif
            }

            SetDrsCalibration(calib.data());
        }

        void LoadDrsCalibrationFromFile(const string& fileName)
        {
            factfits drsFile(fileName);
            float* drsCalibFloat  = reinterpret_cast<float*>(drsFile.SetPtrAddress("BaselineMean"));

            drsFile.GetNextRow();

            SetDrsCalibration(drsCalibFloat);
        }

        //write the header of the binary table
        bool WriteTableHeader(const char* name="DATA")
        {
            if (!reallocateBuffers())
                throw ("While allocating memory: apparently there not as much free memory as advertized...");

            ofits::WriteTableHeader(name);

            //start the compression queues
            for (auto it=fCompressionQueues.begin(); it!= fCompressionQueues.end(); it++)
                it->start();

            //mark that no tile has been written so far
            fLatestWrittenTile = -1;

            if (IsOffsetCalibrated())
            {//retrieve the column storing the start cell offsets, if required.

                for (auto it=fRealColumns.begin(); it!=fRealColumns.end(); it++)//Table.cols.begin(); it!= fTable.cols.end(); it++)
                {
                    if (it->col.name == "StartCellData")
                        fStartCellsOffset = it->col.offset;
                    if (it->col.name == "Data")
                    {
                        fNumSlices = it->col.num;
                        fDataOffset = it->col.offset;
                        if (fNumSlices % 1440 != 0)
                        {
#ifdef __EXCEPTIONS
                            throw runtime_error("Number of data samples not a multiple of 1440.");
#else
                            gLog << ___err___ << "ERROR - Number of data samples not a multiple of 1440. Doing it uncalibrated." << endl;
#endif
                            fOffsetCalibration.resize(0);
                        }
                        fNumSlices /= 1440;
                    }
                }
                if (fStartCellsOffset < 0)
                {
#ifdef __EXCEPTIONS
                    throw runtime_error("FACT Calibration requested, but \"StartCellData\" column not found.");
#else
                    gLog << ___err___ << "ERROR - FACT Calibration requested, but \"StartCellData\" column not found. Doing it uncalibrated." << endl;
#endif
                    //throw away the calibration data
                    fOffsetCalibration.resize(0);
                }
                if (fDataOffset < 0)
                {
#ifdef __EXCEPTIONS
                    throw runtime_error("FACT Calibration requested, but \"Data\" column not found.");
#else
                    gLog << ___err___ << "ERROR - FACT Calibration requested, but \"Data\" column not found. Doing it uncalibrated." << endl;
#endif
                    //throw away the calibration data
                    fOffsetCalibration.resize(0);
                }
            }
        }

        //Opens the file through ofits::open and then registers the
        //compression-related header keys with placeholder values
        //(several are given their final values in close()).
        //filename and addEXTNAMEKey are forwarded unchanged to ofits::open.
        void open(const char* filename, bool addEXTNAMEKey=true)
        {
            ofits::open(filename, addEXTNAMEKey);

            //add compression-related header entries
            SetBool("ZTABLE", true, "Table is compressed");
            SetInt("ZNAXIS1", 0, "Width of uncompressed rows");
            SetInt("ZNAXIS2", 0, "Number of uncompressed rows");
            SetInt("ZPCOUNT", 0, "");
            SetInt("ZHEAPPTR", 0, "");
            SetInt("ZTILELEN", fNumRowsPerTile, "Number of rows per tile");
            SetInt("THEAP", 0, "");
            SetStr("RAWSUM", "         0", "Checksum of raw littlen endian data");


            //start a fresh checksum of the raw (uncompressed) data
            fRawSum.reset();
        }

        bool WriteDrsOffsetsTable()
        {
            if (!IsOffsetCalibrated())
                return false;

            ofits c;
            c.SetStr("XTENSION", "BINTABLE"            , "binary table extension");
            c.SetInt("BITPIX"  , 8                     , "8-bit bytes");
            c.SetInt("NAXIS"   , 2                     , "2-dimensional binary table");
            c.SetInt("NAXIS1"  , 1024*1440*2           , "width of table in bytes");
            c.SetInt("NAXIS2"  , 1                     , "number of rows in table");
            c.SetInt("PCOUNT"  , 0                     , "size of special data area");
            c.SetInt("GCOUNT"  , 1                     , "one data group (required keyword)");
            c.SetInt("TFIELDS" , 1                     , "number of fields in each row");
            c.SetStr("CHECKSUM", "0000000000000000"    , "Checksum for the whole HDU");
            c.SetStr("DATASUM" ,  "         0"         , "Checksum for the data block");
            c.SetStr("EXTNAME" , "ZDrsCellOffsets"     , "name of this binary table extension");
            c.SetStr("TTYPE1"  , "OffsetCalibration"   , "label for field   1");
            c.SetStr("TFORM1"  , "1474560I"            , "data format of field: 2-byte INTEGER");
            c.End();

            vector<char> swappedOffsets;
            swappedOffsets.resize(1024*1440*sizeof(int16_t));
            revcpy<sizeof(int16_t)>(swappedOffsets.data(), (char*)(fOffsetCalibration.data()), 1024*1440);

            Checksum datasum;
            datasum.add(swappedOffsets.data(), sizeof(int16_t)*1024*1440);

            ostringstream dataSumStr;
            dataSumStr << datasum.val();
            c.SetStr("DATASUM", dataSumStr.str());

            datasum += c.WriteHeader(*this);

            const off_t here_I_am = tellp();

            c.SetStr("CHECKSUM", datasum.str());
            c.WriteHeader(*this);

            seekp(here_I_am);

            write(swappedOffsets.data(), swappedOffsets.size());

            AlignTo2880Bytes();

            return good();
        }

        //Returns fRealRowWidth, the width in bytes of one uncompressed row
        //(sum of size*count over the columns added through AddColumn).
        uint32_t GetBytesPerRow() const
        {
            return fRealRowWidth;
        }

        bool WriteCatalog()
        {
            const uint32_t one_catalog_row_size = fTable.num_cols*2*sizeof(uint64_t);
            const uint32_t total_catalog_size = fCatalog.size()*one_catalog_row_size;

            vector<char> swapped_catalog(total_catalog_size);
            uint32_t shift = 0;
            for (auto it=fCatalog.begin(); it!=fCatalog.end(); it++)
            {
                revcpy<sizeof(uint64_t)>(swapped_catalog.data() + shift, (char*)(it->data()), fTable.num_cols*2);
                shift += one_catalog_row_size;
            }

            if (fCatalogOffset == 0)
            {
                fCatalogOffset = tellp();
            }

            const off_t where_are_we = tellp();

            seekp(fCatalogOffset);
            write(swapped_catalog.data(), total_catalog_size);
            if (where_are_we != fCatalogOffset)
                seekp(where_are_we);

            fCatalogSum.reset();
            fCatalogSum.add(swapped_catalog.data(), total_catalog_size);

            return good();
        }

        //Appends one uncompressed row to the current tile.
        //ptr: row data, cnt: its size in bytes, which must equal fRealRowWidth.
        //byte_swap is not used by this implementation.
        //The row is copied to the tile buffer, added to the raw checksum and, if a
        //drs calibration is loaded, offset-corrected in the tile buffer. Once the
        //tile is full it is handed to one of the compression queues.
        //Returns true on success. On a wrong row size or when the file is full, a
        //runtime_error is thrown (exception builds) or false is returned.
        bool WriteRow(const void* ptr, size_t cnt, bool byte_swap=true)
        {
            if (cnt != fRealRowWidth)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Wrong size of row given to WriteRow");
#else
                gLog << ___err___ << "ERROR - Wrong size of row given to WriteRow" << endl;
                return false;
#endif
            }

            //the catalog was sized for fNumTiles tiles of fNumRowsPerTile rows
            if (fTable.num_rows >= fNumRowsPerTile*fNumTiles)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Maximum number of rows exceeded for this file");
#else
                gLog << ___err___ << "ERROR - Maximum number of rows exceeded for this file" << endl;
                return false;
#endif
            }

            //copy current row to pool or rows waiting for compression
            char* target_location = fBuffer + fRealRowWidth*(fTable.num_rows%fNumRowsPerTile);
            memcpy(target_location, ptr, fRealRowWidth);

            //for now, make an extra copy of the data, for RAWSUM checksuming.
            //Ideally this should be moved to the threads, along with the drs-offset-calibration
            //However, because the RAWSUM must be calculated before the tile is transposed, I am not sure whether
            //one extra memcpy per row written is worse than 100 rows checksumed when the tile is full....
            //rawOffset: position of the row start within a 4-byte word of the raw data stream
            const uint32_t rawOffset = (fTable.num_rows*fRealRowWidth)%4;
            char* buffer = fRawSumBuffer.data() + rawOffset;
            //zero the first four and the last four bytes of the checksum buffer, so
            //that whatever is not overwritten by the row below is zero
            auto ib = fRawSumBuffer.begin();
            auto ie = fRawSumBuffer.rbegin();
            *ib++ = 0;
            *ib++ = 0;
            *ib++ = 0;
            *ib   = 0;

            //ie is a reverse iterator: ++ walks backwards from the last byte
            *ie++ = 0;
            *ie++ = 0;
            *ie++ = 0;
            *ie   = 0;

            memcpy(buffer, ptr, fRealRowWidth);

            //NOTE(review): the meaning of the 'false' argument is defined by Checksum::add, not visible here
            fRawSum.add(fRawSumBuffer, false);

            if (IsOffsetCalibrated())
            {

                //both columns are read from / modified in the tile buffer copy of the row
                int16_t* startCell = reinterpret_cast<int16_t*>(target_location + fStartCellsOffset);
                int16_t* data      = reinterpret_cast<int16_t*>(target_location + fDataOffset);

                for (uint32_t ch=0; ch<1440; ch++)
                {
                    //negative start cell: leave the samples of this channel untouched
                    if (startCell[ch] < 0)
                    {
                        data += fNumSlices;
                        continue;
                    }

                    const int16_t modStart = startCell[ch]%1024;
                    const int16_t *off     = fOffsetCalibration.data() + ch*1024;

                    const int16_t* cal        = off+modStart;
                    const int16_t* end_stride = data+fNumSlices;

                    //the samples wrap around the end of the 1024 calibration values of this
                    //channel: subtract up to the end first, then restart from the beginning
                    if (modStart+fNumSlices > 1024)
                    {
                        while (cal < off+1024)
                            *data++ -= *cal++;
                        cal = off;
                    }

                    while (data<end_stride)
                        *data++ -= *cal++;
                }
            }

            fTable.num_rows++;

            //tile complete: queue it for compression, cycling through the queues
            if (fTable.num_rows % fNumRowsPerTile == 0)
            {
                CompressionTarget compress_target;
                SetNextCompression(compress_target);

                if (!fCompressionQueues[fQueueLooper].post(compress_target))
                    throw runtime_error("I could not post this buffer. This does not make sense...");

                //NOTE(review): relies on fNumQueues being non-zero and equal to fCompressionQueues.size()
                fQueueLooper = (fQueueLooper+1)%fNumQueues;
            }

            return true;
        }

        //Updates the row counters in the header and flushes it:
        //NAXIS2 gets the number of complete tiles (integer division),
        //ZNAXIS2 the number of uncompressed rows written so far.
        void FlushNumRows()
        {
            SetInt("NAXIS2", fTable.num_rows/fNumRowsPerTile);
            SetInt("ZNAXIS2", fTable.num_rows);
            FlushHeader();
        }

        //Fills 'target' with everything needed to compress the current tile:
        //a transposed copy of the tile buffer as source, a fresh memory chunk as
        //destination, the tile number and the current row count.
        void SetNextCompression(CompressionTarget& target)
        {
            //source: transposed copy of the rows accumulated so far
            shared_ptr<MemoryChunk> source_chunk = fMemPool.malloc();
            copyTransposeTile(fBuffer, source_chunk->get());

            target.src = source_chunk;

            //destination: nothing compressed yet, hence a size of zero
            target.target.tile_num = (fTable.num_rows-1)/fNumRowsPerTile;
            target.target.size     = 0;
            target.target.target   = fMemPool.malloc();

            target.num_rows = fTable.num_rows;
        }

        bool close()
        {
            if (tellp() < 0)
                return false;

            for (auto it=fCompressionQueues.begin(); it != fCompressionQueues.end(); it++)
                it->wait();

            fWriteToDiskQueue.wait();

            if (fTable.num_rows%fNumRowsPerTile != 0)
            {
                CompressionTarget compress_target;
                SetNextCompression(compress_target);

                uint64_t size_to_write = CompressBuffer(compress_target);

                WriteTarget write_target;
                write_target.size     = size_to_write;
                write_target.target   = compress_target.target.target;
                write_target.tile_num = compress_target.target.tile_num;

                if (!WriteBufferToDisk(write_target))
                    throw runtime_error("Something went wrong while writing the last tile...");
            }

            AlignTo2880Bytes();

            //update header keywords
            SetInt("ZNAXIS1", fRealRowWidth);
            SetInt("ZNAXIS2", fTable.num_rows);

            uint64_t heap_offset = fCatalog.size()*fTable.num_cols*sizeof(uint64_t)*2;
            SetInt("ZHEAPPTR", heap_offset);

            const uint32_t total_num_tiles_written = (fTable.num_rows + fNumRowsPerTile-1)/fNumRowsPerTile;

            SetInt("THEAP", total_num_tiles_written*2*sizeof(int64_t)*fTable.num_cols);

            SetInt("NAXIS1", 2*sizeof(int64_t)*fTable.num_cols);
            SetInt("NAXIS2", total_num_tiles_written);

            ostringstream str;
            str << fRawSum.val();
            SetStr("RAWSUM", str.str());

            int64_t heap_size = 0;
            int64_t compressed_offset = 0;

            for (uint32_t i=0; i<total_num_tiles_written; i++)
            {
                compressed_offset += sizeof(TileHeader);
                heap_size         += sizeof(TileHeader);
                for (uint32_t j=0; j<fCatalog[i].size(); j++)
                {
                    heap_size += fCatalog[i][j].first;
                    fCatalog[i][j].second = compressed_offset;
                    compressed_offset += fCatalog[i][j].first;
                    if (fCatalog[i][j].first == 0)
                        fCatalog[i][j].second = 0;
                }
            }

            //add to the heap size the size of the gap between the catalog and the actual heap
            heap_size += (fCatalog.size() - total_num_tiles_written)*fTable.num_cols*sizeof(uint64_t)*2;

            SetInt("PCOUNT", heap_size, "size of special data area");

            //Just for updating the fCatalogSum value
            WriteCatalog();

            fDataSum += fCatalogSum;

            const Checksum checksm = UpdateHeaderChecksum();

            ofstream::close();

            if ((checksm+fDataSum).valid())
                return true;

            ostringstream sout;
            sout << "Checksum (" << std::hex << checksm.val() << ") invalid.";
#ifdef __EXCEPTIONS
            throw runtime_error(sout.str());
#else
            gLog << ___err___ << "ERROR - " << sout.str() << endl;
            return false;
#endif
        }

        //Overload of the ofits method. Just calls the zofits specific one with default, uncompressed options for this column
        //(default BlockHeader, single kFactRaw processing).
        //Returns the result of the zofits specific AddColumn. The function used to
        //end without returning anything although it is declared bool.
        bool AddColumn(uint32_t cnt, char typechar, const string& name, const string& unit, const string& comment="", bool addHeaderKeys=true)
        {
            BlockHeader head;
            vector<uint16_t> processing(1);
            processing[0] = kFactRaw;
            return AddColumn(cnt, typechar, name, unit, head, processing, comment, addHeaderKeys);
        }

        bool AddColumn(uint32_t cnt, char typechar, const string& name, const string& unit, BlockHeader& header, vector<uint16_t>& comp_sequence, const string& comment="", bool addHeaderKeys=true)
        {
            if (!ofits::AddColumn(1, 'Q', name, unit, comment, addHeaderKeys))
                return false;

            Table::Column col;
            size_t size = SizeFromType(typechar);

            col.name   = name;
            col.type   = typechar;
            col.num    = cnt;
            col.size   = size;
            col.offset = fRealRowWidth;

            fRealRowWidth += size*cnt;

            fRealColumns.emplace_back(CompressedColumn(col, header, comp_sequence));

            ostringstream strKey, strVal, strCom;
            strKey << "ZFORM" << fRealColumns.size();
            strVal << cnt << typechar;
            strCom << "format of " << name << " [" << CommentFromType(typechar);
            SetStr(strKey.str(), strVal.str(), strCom.str());

            strKey.str("");
            strVal.str("");
            strCom.str("");
            strKey << "ZCTYP" << fRealColumns.size();
            strVal << "FACT";
            strCom << "Comp. of FACT telescope";
            SetStr(strKey.str(), strVal.str(), strCom.str());

            return true;
        }

        //Sets the number of compression queues (one per working thread).
        //num must be between 1 and 64 and the file must be closed. Otherwise a
        //runtime_error is thrown (exception builds) or an error is logged and
        //false is returned (non-exception builds).
        //On success the queue counter and the round-robin index are always
        //brought in line with the new number of queues, whether the set of queues
        //grew, shrank or kept its size, so that WriteRow never selects a queue
        //that no longer exists.
        bool SetNumWorkingThreads(uint32_t num)
        {
            if (is_open())
            {
#ifdef __EXCEPTIONS
                throw runtime_error("File must be closed before changing the number of compression threads");
#else
                gLog << ___err___ << "ERROR - File must be closed before changing the number of compression threads" << endl;
#endif
                return false;
            }
            if (num < 1 || num > 64)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Number of threads must be between 1 and 64");
#else
                gLog << ___err___ << "ERROR - Number of threads must be between 1 and 64" << endl;
#endif
                return false;
            }

            if (fCompressionQueues.size() != num)
            {
                //template used by resize for the queues to be added, if any
                //cannot be const, as resize does not want it that way
                Queue<CompressionTarget> queue(bind(&zofits::CompressBuffer, this, placeholders::_1), false, false);

                //grows or shrinks as needed
                fCompressionQueues.resize(num, queue);
            }

            fNumQueues   = num;
            fQueueLooper = 0;

            return true;
        }


    private:

        //Sizes the memory pool chunks for one tile, grabs the tile buffer, sizes
        //the raw checksum buffer and resets the catalog to fNumTiles rows of
        //(0,0) entries, one entry per column.
        //Always returns true.
        bool reallocateBuffers()
        {
            //one chunk = one tile of rows + one block header per column + the tile header
            const size_t chunk_size = fRealRowWidth*fNumRowsPerTile + fRealColumns.size()*sizeof(BlockHeader) + sizeof(TileHeader) + 8; //+8 for checksuming;
            fMemPool.setChunkSize(chunk_size);

            fSmartBuffer = fMemPool.malloc();
            fBuffer      = fSmartBuffer->get();

            //one row, rounded up to the next multiple of 4 bytes (4 more if already aligned)
            fRawSumBuffer.resize(fRealRowWidth + 4-fRealRowWidth%4); //for checksuming

            //give the catalog enough space
            fCatalog.resize(fNumTiles);
            for (auto& catalog_row : fCatalog)
            {
                catalog_row.resize(fRealColumns.size());
                for (auto& entry : catalog_row)
                    entry = CatalogEntry(0,0);
            }

            return true;
        }

        //Writes sizeToWrite bytes located at src+4 to the file and adds them to
        //fDataSum. The first 4 bytes of src are a reserved area: they may be
        //overwritten with zeros here and are never written to disk.
        //The checksum is fed with blocks whose size is a multiple of 4: the data is
        //padded with zeros on the left (fCheckOffset bytes, carried over from the
        //previous call) and on the right (extraBytes).
        //NOTE(review): the zero padding on the right is written just past the data, so the
        //buffer must have room for up to 3 more bytes - presumably the "+8" of reallocateBuffers.
        //Returns the state of the stream.
        bool writeCompressedDataToDisk(char* src, uint32_t sizeToWrite)
        {
            char* checkSumPointer = src+4;
            int32_t extraBytes = 0;
            uint32_t sizeToChecksum = sizeToWrite;
            if (fCheckOffset != 0)
            {//should we extend the array to the left ?
                sizeToChecksum += fCheckOffset;
                checkSumPointer -= fCheckOffset;
                memset(checkSumPointer, 0, fCheckOffset);
            }
            if (sizeToChecksum%4 != 0)
            {//should we extend the array to the right ?
                extraBytes = 4 - (sizeToChecksum%4);
                memset(checkSumPointer+sizeToChecksum, 0,extraBytes);
                sizeToChecksum += extraBytes;
            }

            //do the checksum
            fDataSum.add(checkSumPointer, sizeToChecksum);

            //left padding needed by the next call: 0 if this one ended on a 4-byte boundary
            fCheckOffset = (4 - extraBytes)%4;
            //write data to disk
            write(src+4, sizeToWrite);

            return good();
        }

        bool CompressBuffer(const CompressionTarget& target)
        {
            //compress the buffer
            uint64_t compressed_size = compressBuffer(target.target.target.get()->get(), target.src.get()->get(), target.num_rows);

            //post the result to the writing queue
            //get a copy so that it becomes non-const
            WriteTarget wt;
            wt.tile_num = target.target.tile_num;
            wt.size     = compressed_size;
            wt.target   = target.target.target;

            fWriteToDiskQueue.post(wt);
            return true;
        }

        //Writes one compressed tile to disk, provided it is the tile following the
        //latest one written.
        //Returns false (and does nothing) if it is not this tile's turn, true once
        //the tile was handed to writeCompressedDataToDisk, whose result is not checked.
        bool WriteBufferToDisk(const WriteTarget& target)
        {
            //tiles must reach the disk in order
            const bool is_next_tile = (target.tile_num == fLatestWrittenTile+1);
            if (!is_next_tile)
                return false;

            fLatestWrittenTile++;

            writeCompressedDataToDisk(target.target->get(), target.size);

            return true;
        }

        //Compresses one tile, column after column.
        //dest: output buffer. Its first 4 bytes are skipped (reserved for checksuming), then
        //      come the TileHeader and, for each column, a BlockHeader, the list of
        //      processings and the processed data.
        //src:  tile data as laid out by copyTransposeTile.
        //num_rows: value of the table row counter when the tile was queued; it is used to
        //      derive the number of rows in the tile and the catalog row to update.
        //For every column the catalog entry of the tile gets the offset (second) and
        //the size (first) of the column's block.
        //Returns the total number of bytes produced, TileHeader included.
        //src cannot be const, as applySMOOTHING is done in place
        uint64_t compressBuffer(char* dest, char* src, uint32_t num_rows)
        {
            //a remainder of zero means that the tile is full
            uint32_t thisRoundNumRows = (num_rows%fNumRowsPerTile) ? num_rows%fNumRowsPerTile : fNumRowsPerTile;
            uint32_t offset=0;
            uint32_t currentCatalogRow = (num_rows-1)/fNumRowsPerTile;

            //skip the checksum reserved area
            dest += 4;

            //skip the 'TILE' marker and tile size entry
            uint64_t compressedOffset = sizeof(TileHeader);

            //now compress each column one by one by calling compression on arrays
            for (uint32_t i=0;i<fRealColumns.size();i++)
            {
                fCatalog[currentCatalogRow][i].second = compressedOffset;

                //empty column: nothing is written and the catalog size entry is left untouched
                if (fRealColumns[i].col.num == 0) continue;

                //NOTE(review): 'head' refers to the header stored in fRealColumns and its size
                //is overwritten below - confirm this is safe if several tiles are compressed concurrently
                BlockHeader& head = fRealColumns[i].head;
                const vector<uint16_t>& sequence = fRealColumns[i].comp_sequence;

                //set the default byte telling if uncompressed the compressed Flag
                uint64_t previousOffset = compressedOffset;
                //skip header data
                compressedOffset += sizeof(BlockHeader) + sizeof(uint16_t)*sequence.size();

                //apply the processings of the sequence, in order
                for (uint32_t j=0;j<sequence.size(); j++)
                {
                    switch (sequence[j])
                    {
                        case zfits::kFactRaw:
                                compressedOffset += compressUNCOMPRESSED(dest + compressedOffset,
                                                                         src  + offset,
                                                                         thisRoundNumRows,
                                                                         fRealColumns[i].col.size,
                                                                         fRealColumns[i].col.num);
                        break;
                        //the output position is not advanced by this step
                        case zfits::kFactSmoothing:
                                applySMOOTHING(dest + compressedOffset,
                                               src  + offset,
                                               thisRoundNumRows,
                                               fRealColumns[i].col.size,
                                               fRealColumns[i].col.num);
                        break;
                        //the roles of "rows" and "elements per row" are swapped depending on the ordering
                        case zfits::kFactHuffman16:
                            if (head.ordering == zfits::kOrderByCol)
                                compressedOffset += compressHUFFMAN(dest + compressedOffset,
                                                                    src  + offset,
                                                                    thisRoundNumRows,
                                                                    fRealColumns[i].col.size,
                                                                    fRealColumns[i].col.num);
                            else
                                compressedOffset += compressHUFFMAN(dest + compressedOffset,
                                                                    src  + offset,
                                                                    fRealColumns[i].col.num,
                                                                    fRealColumns[i].col.size,
                                                                    thisRoundNumRows);
                        break;
                        default:
                            cout << "ERROR: Unkown compression sequence entry: " << sequence[j] << endl;
                        break;
                    }
                }

                //check if compressed size is larger than uncompressed
                if (sequence[0] != zfits::kFactRaw &&
                    compressedOffset - previousOffset > fRealColumns[i].col.size*fRealColumns[i].col.num*thisRoundNumRows+sizeof(BlockHeader)+sizeof(uint16_t)*sequence.size())
                {//if so set flag and redo it uncompressed
                    cout << "REDOING UNCOMPRESSED" << endl;
                    //NOTE(review): a single byte is reserved for the processing id here, whereas
                    //the regular path reserves sizeof(uint16_t) per entry - confirm against the reader
                    compressedOffset = previousOffset + sizeof(BlockHeader) + 1;
                    compressedOffset += compressUNCOMPRESSED(dest + compressedOffset, src + offset, thisRoundNumRows, fRealColumns[i].col.size, fRealColumns[i].col.num);
                    //dedicated header for the raw block; the column's own header is left unchanged
                    BlockHeader he;
                    he.size = compressedOffset - previousOffset;
                    he.numProcs = 1;
                    he.ordering = zfits::kOrderByRow;
                    memcpy(dest + previousOffset, (char*)(&he), sizeof(BlockHeader));
                    dest[previousOffset+sizeof(BlockHeader)] = zfits::kFactRaw;
                    offset += thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num;
                    fCatalog[currentCatalogRow][i].first = compressedOffset - fCatalog[currentCatalogRow][i].second;
                    continue;
                }

                //regular case: write the block header and the list of processings in front of the data
                head.size = compressedOffset - previousOffset;
                memcpy(dest + previousOffset, (char*)(&head), sizeof(BlockHeader));
                memcpy(dest + previousOffset+sizeof(BlockHeader), sequence.data(), sizeof(uint16_t)*sequence.size());

                //move on to the data of the next column
                offset += thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num;
                fCatalog[currentCatalogRow][i].first = compressedOffset - fCatalog[currentCatalogRow][i].second;
            }

            //finally write the tile header at the very beginning (after the reserved area)
            TileHeader tile_head(thisRoundNumRows, compressedOffset);
            memcpy(dest, &tile_head, sizeof(TileHeader));

            return compressedOffset;
        }

        /// Copy the rows of one tile from src to dest, regrouping the data column by column.
        ///
        /// For every entry of fRealColumns the data of all rows of the tile is written contiguously,
        /// either cell by cell (kOrderByRow) or element by element (kOrderByCol).
        ///
        /// @param src  start of the tile's rows; consecutive rows are fRealRowWidth bytes apart
        /// @param dest output buffer, filled sequentially
        void copyTransposeTile(const char* src, char* dest)
        {
            //a partially filled tile holds the remainder of the division, otherwise the tile is full
            uint32_t rowsInTile = fTable.num_rows%fNumRowsPerTile;
            if (rowsInTile == 0)
                rowsInTile = fNumRowsPerTile;

            for (uint32_t colIdx=0; colIdx<fRealColumns.size(); colIdx++)
            {
                const auto& column   = fRealColumns[colIdx];
                const char* colStart = src + column.col.offset;

                if (column.head.ordering == zfits::kOrderByRow)
                {
                    //"semi-transposed" copy: the whole cell of a row is kept together
                    const auto cellBytes = column.col.size*column.col.num;
                    for (uint32_t row=0; row<rowsInTile; row++)
                    {
                        memcpy(dest, colStart + row*fRealRowWidth, cellBytes);
                        dest += cellBytes;
                    }
                }
                else if (column.head.ordering == zfits::kOrderByCol)
                {
                    //fully transposed copy: for each element of the cell, gather it from every row
                    for (int elem=0; elem<column.col.num; elem++)
                    {
                        for (uint32_t row=0; row<rowsInTile; row++)
                        {
                            memcpy(dest, colStart + row*fRealRowWidth + column.col.size*elem, column.col.size);
                            dest += column.col.size;
                        }
                    }
                }
                else
                {
                    //nothing is copied for this column, only a message is printed
                    cout << "Error: unknown column ordering: " << column.head.ordering << endl;
                }
            }
        }

        /// Specific compression functions
        /// "Compression" that stores the data as-is: copies the input block verbatim.
        ///
        /// @param dest        output buffer, must hold numRows*sizeOfElems*numRowElems bytes
        /// @param src         input data
        /// @param numRows     number of rows in the block
        /// @param sizeOfElems size in bytes of one element
        /// @param numRowElems number of elements per row
        /// @return the number of bytes written to dest
        uint32_t compressUNCOMPRESSED(char* dest, const char* src, uint32_t numRows, uint32_t sizeOfElems, uint32_t numRowElems)
        {
            const uint32_t numBytes = numRows*sizeOfElems*numRowElems;
            memcpy(dest, src, numBytes);
            return numBytes;
        }

        /// Huffman-encode a block of data, one chunk per element position of the row.
        ///
        /// Layout written to dest: numRowElems uint32_t values (the encoded size in bytes of each
        /// chunk), followed by the concatenated encoded chunks. The size table is always written;
        /// the encoded payload is only copied when the total is smaller than the raw size.
        ///
        /// @param dest        output buffer
        /// @param src         input data; chunk j starts at byte j*sizeOfElems*numRows and is read as 16-bit words
        /// @param numRows     number of rows in the block
        /// @param sizeOfElems size in bytes of one element, must be at least 2
        /// @param numRowElems number of elements per row (= number of encoded chunks)
        /// @return size table plus encoded payload in bytes; a value larger than the raw size if
        ///         numRows < 2; 0 if sizeOfElems < 2
        uint32_t compressHUFFMAN(char* dest, const char* src, uint32_t numRows, uint32_t sizeOfElems, uint32_t numRowElems)
        {
            const uint32_t rawSize = numRows*sizeOfElems*numRowElems;

            if (numRows < 2)
            {//if we have less than 2 elems to compress, Huffman encoder does not work (and has no point). Just return larger size than uncompressed to trigger the raw storage.
                return rawSize + 1000;
            }
            if (sizeOfElems < 2 )
            {
                cout << "Fatal ERROR: HUFMANN can only encode short or longer types" << endl;
                return 0;
            }

            string   huffmanOutput;
            uint32_t previousHuffmanSize = 0;
            uint32_t huffmanOffset       = 0;
            for (uint32_t j=0;j<numRowElems;j++)
            {
                //the size bookkeeping below relies on Encode appending to huffmanOutput
                //NOTE(review): src is read through a uint16_t pointer - confirm callers provide suitably aligned data
                Huffman::Encode(huffmanOutput,
                                reinterpret_cast<const uint16_t*>(&src[j*sizeOfElems*numRows]),
                                numRows*(sizeOfElems/2));

                //store the size of this chunk with memcpy: dest is a plain byte buffer at an arbitrary
                //offset, so writing through a uint32_t* would be an unaligned, aliasing-violating store
                const uint32_t chunkSize = huffmanOutput.size() - previousHuffmanSize;
                memcpy(&dest[huffmanOffset], &chunkSize, sizeof(chunkSize));

                huffmanOffset += sizeof(uint32_t);
                previousHuffmanSize = huffmanOutput.size();
            }
            const size_t totalSize = huffmanOutput.size() + huffmanOffset;

            //only copy if not larger than not-compressed size
            if (totalSize < rawSize)
                memcpy(&dest[huffmanOffset], huffmanOutput.data(), huffmanOutput.size());

            return totalSize;
        }

        /// In-place pre-processing: replace every 16-bit sample (except the first two) by its
        /// difference to the integer average of the two samples preceding it.
        ///
        /// The buffer is walked from the last sample towards the front, so each difference is
        /// computed from the still unmodified predecessors. The data is always treated as int16_t,
        /// whatever sizeOfElems is; dest is not used.
        ///
        /// @param dest        unused
        /// @param src         buffer of numRows*numRowElems int16_t samples, modified in place
        /// @param numRows     number of rows in the block
        /// @param sizeOfElems size in bytes of one element (only used for the returned size)
        /// @param numRowElems number of elements per row
        /// @return numRows*sizeOfElems*numRowElems
        uint32_t applySMOOTHING(char* dest, char* src, uint32_t numRows, uint32_t sizeOfElems, uint32_t numRowElems)
        {
            int16_t* const samples = reinterpret_cast<int16_t*>(src);

            int idx = numRowElems*numRows - 1;
            while (idx > 1)
            {
                const int prevAverage = (samples[idx-1] + samples[idx-2])/2;
                samples[idx] = samples[idx] - prevAverage;
                --idx;
            }

            return numRows*sizeOfElems*numRowElems;
        }

        //Offsets calibration stuff.
        vector<int16_t> fOffsetCalibration; ///< The calibration itself
        int32_t         fStartCellsOffset;  ///< Offset in bytes for the startcell data
        int32_t         fDataOffset;        ///< Offset in bytes for the data
        int32_t         fNumSlices;         ///< Number of samples per pixel per event

        //Compressed data stuff
        int32_t         fCheckOffset;       ///< offset to the data pointer to calculate the checksum
        uint32_t        fNumTiles;          ///< NOTE(review): presumably the number of tiles the catalog is sized for - confirm
        uint32_t        fNumRowsPerTile;    ///< Number of rows grouped into one tile (a trailing tile may hold fewer, see copyTransposeTile)

        //Work queues (thread related stuff)
        vector<Queue<CompressionTarget>> fCompressionQueues; ///< Queues of compression jobs; presumably one per compression thread - confirm
        Queue<WriteTarget>               fWriteToDiskQueue;  ///< Queue of WriteTarget items; presumably compressed tiles waiting to be written - confirm

        //Thread bookkeeping (thread related stuff)
        uint32_t          fNumQueues;    ///< The number of threads that will be used to compress
        uint32_t          fQueueLooper;       ///< NOTE(review): looks like the index used to cycle through fCompressionQueues - confirm
        int32_t           fLatestWrittenTile; ///< NOTE(review): presumably the number of the last tile written to disk - confirm

        /// One catalog cell: a pair of 64-bit values, stored without padding.
        /// In the tile compression code `first` is set to the distance between the end of a
        /// column's compressed block and the position held in `second`.
        struct CatalogEntry
        {
            int64_t first;
            int64_t second;

            /// Both values default to zero.
            CatalogEntry(int64_t f=0, int64_t s=0) : first(f), second(s)
            {
            }
        } __attribute__((__packed__));

        //Catalog stuff. The catalog is indexed as fCatalog[catalog row][column index]
        typedef vector<CatalogEntry>   CatalogRow;  ///< One catalog row: one entry per column
        typedef vector<CatalogRow>     CatalogType; ///< The whole catalog: a list of catalog rows
        CatalogType          fCatalog;
        Checksum             fCatalogSum;    ///< NOTE(review): presumably the checksum of the catalog - confirm
        Checksum             fRawSum;        ///< NOTE(review): presumably the checksum of the uncompressed data - confirm
        off_t                fCatalogOffset; ///< NOTE(review): presumably the position of the catalog in the file - confirm
        uint32_t             fRealRowWidth;  ///< Distance in bytes between two consecutive rows of the input buffer (see copyTransposeTile)

        //Memory stuff
        vector<char>         fRawSumBuffer;  ///< NOTE(review): presumably scratch space used while computing fRawSum - confirm
        MemoryManager        fMemPool;       ///< NOTE(review): presumably the pool the working buffers are taken from - confirm
        uint64_t             fMaxUsableMem;  ///< NOTE(review): presumably an upper limit on the memory used - unit not visible here, confirm

        shared_ptr<MemoryChunk> fSmartBuffer; ///< Shared handle on a MemoryChunk
        char*                   fBuffer;      ///< Raw pointer; NOTE(review): presumably points into the chunk held by fSmartBuffer - confirm


        struct CompressedColumn
        {
            CompressedColumn(Table::Column& c, BlockHeader& h, vector<uint16_t>& cs) : col(c),
                                                                                       head(h),
                                                                                       comp_sequence(cs)
            {}
            Table::Column    col;
            BlockHeader      head;
            vector<uint16_t> comp_sequence;
        };
        vector<CompressedColumn> fRealColumns;

};

#ifndef __MARS__
}; //namespace std
#endif

#ifdef crappy_example_usage
// Usage sketch only. This section is compiled only if crappy_example_usage is defined, and
// it is not self-contained: numThreads, fileNameOut, sortedColumns, i, colName,
// drsCalibFloat, tableName, buffer and rowWidth are not declared here and must come from
// the surrounding application.

// Create the writer, choose the number of worker threads and open the output file.
// NOTE(review): the meaning of the two constructor arguments is not visible here - confirm.
zofits zofitsfile(123456, 100);
zofitsfile.SetNumWorkingThreads(numThreads);
zofitsfile.open((fileNameOut).c_str());
// Block header requesting row ordering and announcing two processing steps...
std::zofits::BlockHeader zoheader(0, zfits::kOrderByRow, 2);
// ...together with the two steps themselves: smoothing followed by 16-bit Huffman.
vector<uint16_t> smoothmanProcessings(2);
smoothmanProcessings[0] = zfits::kFactSmoothing;
smoothmanProcessings[1] = zfits::kFactHuffman16;

// Add a column without giving a block header or a processing sequence.
zofitsfile.AddColumn(sortedColumns[i].num,
                     sortedColumns[i].type,
                     colName,
                     "");

// Add a column with an explicit block header and processing sequence.
zofitsfile.AddColumn(sortedColumns[i].num,
                     sortedColumns[i].type,
                     colName,
                     "",
                     zoheader,
                     smoothmanProcessings);

// Set a header key, provide the DRS calibration, write the table header, write a row, then close the file.
zofitsfile.SetStr("ZCHKSUM", i->second.value, i->second.comment);
zofitsfile.SetDrsCalibration(drsCalibFloat);
zofitsfile.WriteTableHeader(tableName.c_str());
zofitsfile.WriteRow(buffer, rowWidth);
zofitsfile.close();

#endif
