/*
 * zofits.h
 *
 *  FACT native compressed FITS writer
 *      Author: lyard
 */

#include "ofits.h"
#include "Queue.h"
#include "MemoryManager.h"

#include "FITS.h"

#ifdef USE_BOOST_THREADS
#include <boost/thread.hpp>
#endif

using namespace FITS;

#ifndef __MARS__
namespace std
{
#else
using namespace std;
#endif

class zofits : public ofits
{
    public:

        /// Describes one compressed tile that is waiting to be written to disk.
        struct WriteTarget
        {
            /// Orders targets by tile number. The operator is const so that it
            /// can be used on const objects and by standard comparators.
            bool operator < (const WriteTarget& other) const
            {
                return tile_num < other.tile_num;
            }
            uint32_t tile_num;              ///< number of the tile this buffer belongs to
            uint32_t size;                  ///< number of bytes to write
            shared_ptr<MemoryChunk> target; ///< memory chunk holding the data to write
        };

        /// Describes one tile of raw rows that is waiting to be compressed.
        struct CompressionTarget
        {
            /// Orders targets by the tile number of their write target. The
            /// operator is const so that it can be used on const objects and by
            /// standard comparators; it compares the tile numbers directly so it
            /// does not depend on the constness of WriteTarget::operator<.
            bool operator < (const CompressionTarget& other) const
            {
                return target.tile_num < other.target.tile_num;
            }
            shared_ptr<MemoryChunk> src;            ///< raw rows of the tile
            shared_ptr<MemoryChunk> transposed_src; ///< scratch space for the transposed rows
            WriteTarget             target;         ///< where the compressed data goes
            uint32_t                num_rows;       ///< value of fTable.num_rows when the tile was queued
        };


        //constructors
        /// Builds a writer without passing a file name to the ofits base class.
        /// @param numTiles      number of tiles the catalog is sized for
        /// @param rowPerTile    number of rows grouped into one tile
        /// @param maxUsableMem  forwarded to the memory pool and kept in fMaxUsableMem
        zofits(uint32_t numTiles=1000,
               uint32_t rowPerTile=100,
               uint64_t maxUsableMem=0)
            : ofits(),
              fMemPool(0, maxUsableMem),
              fWriteToDiskQueue(bind(&zofits::WriteBufferToDisk, this, placeholders::_1), true, false)
        {
            InitMemberVariables(numTiles, rowPerTile, maxUsableMem);

            // size the compression queues according to the class-wide setting
            SetNumWorkingThreads(fNumQueues);
        }

        /// Builds a writer and hands the file name to the ofits base class.
        /// @param fname         file name forwarded to ofits
        /// @param numTiles      number of tiles the catalog is sized for
        /// @param rowPerTile    number of rows grouped into one tile
        /// @param maxUsableMem  forwarded to the memory pool and kept in fMaxUsableMem
        zofits(const char* fname,
               uint32_t numTiles=1000,
               uint32_t rowPerTile=100,
               uint64_t maxUsableMem=0)
            : ofits(fname),
              fMemPool(0, maxUsableMem),
              fWriteToDiskQueue(bind(&zofits::WriteBufferToDisk, this, placeholders::_1), true, false)
        {
            InitMemberVariables(numTiles, rowPerTile, maxUsableMem);

            // size the compression queues according to the class-wide setting
            SetNumWorkingThreads(fNumQueues);
        }

        /// Virtual destructor; performs no explicit work of its own.
        virtual ~zofits() { }

        //initialization of member variables
        /// Resets the bookkeeping members of the writer.
        /// @param nt            number of tiles the catalog is sized for (must not be 0)
        /// @param rpt           number of rows per tile (must not be 0)
        /// @param maxUsableMem  stored in fMaxUsableMem
        /// @throws runtime_error if nt or rpt is 0
        void InitMemberVariables(uint32_t nt=0, uint32_t rpt=0, uint64_t maxUsableMem=0)
        {
            if (nt == 0)
                throw runtime_error("Cannot work with a catalog of size 0. sorry.");

            // the number of rows per tile is used as a divisor when rows are
            // written and when the file is closed: reject 0 up front
            if (rpt == 0)
                throw runtime_error("Cannot work with tiles of 0 rows. sorry.");

            fCheckOffset  = 0;

            fNumTiles       = nt;
            fNumRowsPerTile = rpt;

            fBuffer       = NULL;
            fRealRowWidth = 0;
            fCatalogExtraRows = 0;

            fCatalogOffset    =  0;

            fMaxUsableMem = maxUsableMem;
#ifdef __EXCEPTIONS
            // no exception pending from the compression threads
            fThreadsException = exception_ptr();
#endif
        }


        //write the header of the binary table
        virtual bool WriteTableHeader(const char* name="DATA")
        {
            if (!reallocateBuffers())
                throw ("While allocating memory: apparently there not as much free memory as advertized...");

            ofits::WriteTableHeader(name);

            if (fNumQueues != 0)
            {
                //start the compression queues
                for (auto it=fCompressionQueues.begin(); it!= fCompressionQueues.end(); it++)
                    it->start();

                fWriteToDiskQueue.start();
            }

            //mark that no tile has been written so far
            fLatestWrittenTile = -1;

            return good();
        }

        /// Opens the file through ofits and registers the compression-related
        /// header keywords with placeholder values.
        /// @param filename       forwarded to ofits::open
        /// @param addEXTNAMEKey  forwarded to ofits::open
        void open(const char* filename, bool addEXTNAMEKey=true)
        {
            ofits::open(filename, addEXTNAMEKey);

            // keywords describing the compressed table; apart from ZTABLE and
            // ZTILELEN they are set to 0 / empty here
            SetBool ("ZTABLE",   true,            "Table is compressed");
            SetInt  ("ZNAXIS1",  0,               "Width of uncompressed rows");
            SetInt  ("ZNAXIS2",  0,               "Number of uncompressed rows");
            SetInt  ("ZPCOUNT",  0,               "");
            SetInt  ("ZHEAPPTR", 0,               "");
            SetInt  ("ZTILELEN", fNumRowsPerTile, "Number of rows per tile");
            SetInt  ("THEAP",    0,               "");
            SetStr  ("RAWSUM",   "         0",    "Checksum of raw little endian data");
            SetFloat("ZRATIO",   0,               "Compression ratio");

            // start from a clean per-file state
            fCatalogExtraRows = 0;
            fRawSum.reset();
        }

        /// Hook with no work at this level: only reports the stream state.
        virtual bool WriteDrsOffsetsTable()
        {
            const bool stream_ok = good();
            return stream_ok;
        }

        /// Returns fRealRowWidth, the row width accumulated by AddColumn.
        uint32_t GetBytesPerRow() const { return fRealRowWidth; }

        bool WriteCatalog()
        {
            const uint32_t one_catalog_row_size = fTable.num_cols*2*sizeof(uint64_t);
            const uint32_t total_catalog_size = fCatalog.size()*one_catalog_row_size;

            vector<char> swapped_catalog(total_catalog_size);
            uint32_t shift = 0;
            for (auto it=fCatalog.begin(); it!=fCatalog.end(); it++)
            {
                revcpy<sizeof(uint64_t)>(swapped_catalog.data() + shift, (char*)(it->data()), fTable.num_cols*2);
                shift += one_catalog_row_size;
            }

            if (fCatalogOffset == 0)
            {
                fCatalogOffset = tellp();
            }

            const off_t where_are_we = tellp();

            seekp(fCatalogOffset);
            write(swapped_catalog.data(), total_catalog_size);
            if (where_are_we != fCatalogOffset)
                seekp(where_are_we);

            fCatalogSum.reset();
            fCatalogSum.add(swapped_catalog.data(), total_catalog_size);

            return good();
        }
        /// Hook called by WriteRow on the freshly copied row; does nothing here.
        virtual void DrsOffsetCalibrate(char* ) { }

        /// Doubles the number of catalog rows. The new rows get fTable.num_cols
        /// entries set to CatalogEntry(0,0); fCatalogExtraRows and fNumTiles are
        /// both increased by the previous catalog size.
        void GrowCatalog()
        {
            const uint32_t previous_size = fCatalog.size();

            fCatalog.resize(fCatalog.size()*2);

            // initialize only the rows that were just added
            for (uint32_t row=previous_size; row<fCatalog.size(); row++)
                fCatalog[row].assign(fTable.num_cols, CatalogEntry(0,0));

            fCatalogExtraRows += previous_size;
            fNumTiles         += previous_size;
        }

        /// Appends one uncompressed row to the current tile, adds it to the RAWSUM
        /// checksum and, when the tile is complete, compresses it either in-line
        /// (no worker threads) or by posting it to the least loaded compression queue.
        /// @param ptr        the row data; fRealRowWidth bytes are read
        /// @param cnt        size of the row in bytes, must equal fRealRowWidth
        /// @param byte_swap  not used by this implementation
        /// @return the state of the stream (good()); false on error in builds
        ///         without exceptions
        /// @throws runtime_error on wrong row size, when the file is full, or when
        ///         the tile could not be written / posted
        bool WriteRow(const void* ptr, size_t cnt, bool byte_swap=true)
        {
            if (cnt != fRealRowWidth)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Wrong size of row given to WriteRow");
#else
                gLog << ___err___ << "ERROR - Wrong size of row given to WriteRow" << endl;
                return false;
#endif
            }

            if (fTable.num_rows >= fNumRowsPerTile*fNumTiles)
            {
//                GrowCatalog();
#ifdef __EXCEPTIONS
                throw runtime_error("Maximum number of rows exceeded for this file");
#else
                gLog << ___err___ << "ERROR - Maximum number of rows exceeded for this file" << endl;
                return false;
#endif
            }

            //copy current row to pool or rows waiting for compression
            char* target_location = fBuffer + fRealRowWidth*(fTable.num_rows%fNumRowsPerTile);
            memcpy(target_location, ptr, fRealRowWidth);

            //for now, make an extra copy of the data, for RAWSUM checksuming.
            //Ideally this should be moved to the threads, along with the drs-offset-calibration
            //However, because the RAWSUM must be calculated before the tile is transposed, I am not sure whether
            //one extra memcpy per row written is worse than 100 rows checksumed when the tile is full....

            //position of the first byte of this row inside a 4-byte word of the raw stream
            const uint32_t rawOffset = (fTable.num_rows*fRealRowWidth)%4;

            //The scratch buffer must hold rawOffset leading bytes plus the row itself,
            //rounded up to a whole number of 4-byte words. A buffer sized from the row
            //width alone is too short when the width is 3 modulo 4 and rawOffset is 2 or 3:
            //the memcpy below would then write past its end.
            size_t paddedSize = ((size_t)rawOffset + fRealRowWidth + 3)/4*4;
            if (paddedSize < 4)
                paddedSize = 4; //the zeroing below always touches 4 bytes
            if (fRawSumBuffer.size() != paddedSize)
                fRawSumBuffer.resize(paddedSize);

            char* buffer = fRawSumBuffer.data() + rawOffset;

            //clear the (at most 3) padding bytes in front of and behind the row:
            //they all lie within the first and the last 4 bytes of the buffer
            auto ib = fRawSumBuffer.begin();
            auto ie = fRawSumBuffer.rbegin();
            *ib++ = 0;
            *ib++ = 0;
            *ib++ = 0;
            *ib   = 0;

            *ie++ = 0;
            *ie++ = 0;
            *ie++ = 0;
            *ie   = 0;

            memcpy(buffer, ptr, fRealRowWidth);

            fRawSum.add(fRawSumBuffer, false);

            DrsOffsetCalibrate(target_location);

            fTable.num_rows++;

            //tile not complete yet: nothing more to do
            if (fTable.num_rows % fNumRowsPerTile != 0)
                return good();

            CompressionTarget compress_target;
            SetNextCompression(compress_target);

            if (fNumQueues == 0)
            { //no worker threads. do everything in-line
                uint64_t size_to_write = CompressBuffer(compress_target);

                WriteTarget write_target;
                write_target.size     = size_to_write;
                write_target.target   = compress_target.target.target;
                write_target.tile_num = compress_target.target.tile_num;

                if (!WriteBufferToDisk(write_target))
                    throw runtime_error("Something went wrong while writing to disk");

                return good();
            }

            //pick the compression queue with the fewest pending entries
            //if all queues are empty, use queue 0
            uint32_t min_index     = 0;
            uint32_t min_size      = numeric_limits<uint32_t>::max();
            uint32_t current_index = 0;

            for (auto it=fCompressionQueues.begin(); it!=fCompressionQueues.end(); it++)
            {
                if (it->size() < min_size)
                {
                    min_index = current_index;
                    min_size = it->size();
                }
                current_index++;
            }

            if (!fCompressionQueues[min_index].post(compress_target))
                throw runtime_error("I could not post this buffer. This does not make sense...");

            return good();
        }

        /// Updates NAXIS2 (rows written divided by rows per tile, rounded down)
        /// and ZNAXIS2 (rows written), then flushes the header.
        void FlushNumRows()
        {
            const auto complete_tiles = fTable.num_rows/fNumRowsPerTile;

            SetInt("NAXIS2",  complete_tiles);
            SetInt("ZNAXIS2", fTable.num_rows);

            FlushHeader();
        }

        /// Fills a compression target for the tile currently held in fSmartBuffer
        /// and replaces fSmartBuffer / fBuffer by a fresh chunk for the next rows.
        /// @param target  receives the source chunk, a scratch chunk for the
        ///                transposed data, the write target and the current row count
        void SetNextCompression(CompressionTarget& target)
        {
            // chunks for the transposed copy and for the compressed output
            shared_ptr<MemoryChunk> transposed_chunk = fMemPool.malloc();
            shared_ptr<MemoryChunk> output_chunk     = fMemPool.malloc();

            // what has to be compressed...
            target.src            = fSmartBuffer;
            target.transposed_src = transposed_chunk;
            target.num_rows       = fTable.num_rows;

            // ...and where the result goes. The tile number is derived from the
            // last row written, the size is not known yet.
            target.target.tile_num = (fTable.num_rows-1)/fNumRowsPerTile;
            target.target.size     = 0;
            target.target.target   = output_chunk;

            // incoming rows now land in a new chunk
            fSmartBuffer = fMemPool.malloc();
            fBuffer      = fSmartBuffer->get();
        }

        /// Merges consecutive catalog rows so that the catalog goes back to the size
        /// it had before it was grown, then updates the THEAP, NAXIS2 and ZTILELEN
        /// header keywords and prints the new layout to cout.
        /// Does nothing when fCatalogExtraRows is 0.
        void ShrinkCatalog()
        {
            //did we write more rows than what the catalog could host ?
            if (fCatalogExtraRows != 0)
            {
                //how many rows can the regular catalog host ?
                const uint32_t max_regular_rows = (fCatalog.size() - fCatalogExtraRows)*fNumRowsPerTile;
                //what's the shrink factor to be applied ? (rows written / rows hostable, rounded up)
                const uint32_t shrink_factor = fTable.num_rows/max_regular_rows + ((fTable.num_rows%max_regular_rows) ? 1 : 0);

                //shrink the catalog !
                for (uint32_t i=0; i<fTable.num_rows/fNumRowsPerTile; i+= shrink_factor)
                {//add the elements one by one, so that the empty ones at the end (i.e. fTable.num_rows%shrink_factor) do not create havoc
                    const uint32_t target_catalog_row = i/shrink_factor;
                    //move data from current row (i) to target row
                    for (uint32_t j=0; j<fTable.num_cols; j++)
                    {
                        //the merged entry starts at the offset of the first tile of the group
                        fCatalog[target_catalog_row][j].second = fCatalog[i][j].second;
                        fCatalog[target_catalog_row][j].first  = 0;
                        uint64_t last_size   = fCatalog[i][j].first;
                        uint64_t last_offset = fCatalog[i][j].second;

                        //accumulate the size of the merged entry from the offset differences
                        //between consecutive tiles; an offset of 0 is taken as "no more tiles"
                        //NOTE(review): fCatalog[i+k] is not range-checked here - confirm that the
                        //catalog always holds at least i+shrink_factor rows
                        for (uint32_t k=1; k<shrink_factor; k++)
                        {
                           if (fCatalog[i+k][j].second != 0)
                           {
                               fCatalog[target_catalog_row][j].first +=  fCatalog[i+k][j].second - last_offset;
                           }
                           else
                           {
                               fCatalog[target_catalog_row][j].first += last_size;
                               break;
                           }
                           last_size   = fCatalog[i+k][j].first;
                           last_offset = fCatalog[i+k][j].second;
                        }
                    }
                }

                //drop the rows that were added when the catalog was grown
                fCatalog.resize(fCatalog.size() - fCatalogExtraRows);

                //update header keywords
                const uint32_t new_num_rows_per_tiles = fNumRowsPerTile*shrink_factor;
                const uint32_t new_num_tiles_written = (fTable.num_rows + new_num_rows_per_tiles-1)/new_num_rows_per_tiles;
                SetInt("THEAP", new_num_tiles_written*2*sizeof(int64_t)*fTable.num_cols);
                SetInt("NAXIS2", new_num_tiles_written);
                SetInt("ZTILELEN", new_num_rows_per_tiles);
                cout << "New num rows per tiles: " << new_num_rows_per_tiles << " shrink factor: " << shrink_factor << endl;
                cout << "Num tiles written: " << new_num_tiles_written << endl;
            }
        }

        /// Finishes the file: calls wait() on all queues, compresses and writes the
        /// last (partial) tile if there is one, updates the header keywords,
        /// rewrites the catalog, updates the checksums and closes the stream.
        /// @return true if the final checksum is valid; false otherwise in builds
        ///         without exceptions
        /// @throws runtime_error if the stream position is invalid, if the last
        ///         tile cannot be written or if the checksum is invalid; an
        ///         exception stored by a compression thread is rethrown
        bool close()
        {
            //call wait() on every compression queue, then on the write queue
            for (auto it=fCompressionQueues.begin(); it != fCompressionQueues.end(); it++)
                it->wait();

            fWriteToDiskQueue.wait();

            //a negative position is treated as a failed stream
            if (tellp() < 0)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Something went wrong while writing to disk...");
#else
                return false;
#endif
            }

#ifdef __EXCEPTIONS
            //check if something happened to the compression threads
            if (fThreadsException != exception_ptr())
            {
                rethrow_exception(fThreadsException);
            }
#endif

            //rows left over in an incomplete tile: compress and write them in-line
            if (fTable.num_rows%fNumRowsPerTile != 0)
            {
                CompressionTarget compress_target;
                SetNextCompression(compress_target);

                //set number of threads to zero before calling compressBuffer
                //(CompressBuffer only skips the write queue when fNumQueues is 0)
                int32_t backup_num_queues = fNumQueues;
                fNumQueues = 0;
                uint64_t size_to_write = CompressBuffer(compress_target);
                fNumQueues = backup_num_queues;

                WriteTarget write_target;
                write_target.size     = size_to_write;
                write_target.target   = compress_target.target.target;
                write_target.tile_num = compress_target.target.tile_num;

                if (!WriteBufferToDisk(write_target))
                    throw runtime_error("Something went wrong while writing the last tile...");
            }

            AlignTo2880Bytes();

            //update header keywords
            SetInt("ZNAXIS1", fRealRowWidth);
            SetInt("ZNAXIS2", fTable.num_rows);

            //size in bytes of the whole catalog (two 64-bit values per column and row)
            uint64_t heap_offset = fCatalog.size()*fTable.num_cols*sizeof(uint64_t)*2;
            SetInt("ZHEAPPTR", heap_offset);

            //number of tiles written, the last partial one included (rounded up)
            const uint32_t total_num_tiles_written = (fTable.num_rows + fNumRowsPerTile-1)/fNumRowsPerTile;

            SetInt("THEAP", total_num_tiles_written*2*sizeof(int64_t)*fTable.num_cols);

            SetInt("NAXIS1", 2*sizeof(int64_t)*fTable.num_cols);
            SetInt("NAXIS2", total_num_tiles_written);

            ostringstream str;
            str << fRawSum.val();
            SetStr("RAWSUM", str.str());

            int64_t heap_size = 0;
            int64_t compressed_offset = 0;

            //turn the per-tile offsets stored in the catalog into running offsets over
            //all tiles, and sum up the total size of the tile headers and column blocks
            for (uint32_t i=0; i<total_num_tiles_written; i++)
            {
                compressed_offset += sizeof(TileHeader);
                heap_size         += sizeof(TileHeader);
                for (uint32_t j=0; j<fCatalog[i].size(); j++)
                {
                    heap_size += fCatalog[i][j].first;
                    fCatalog[i][j].second = compressed_offset;
                    compressed_offset += fCatalog[i][j].first;
                    //entries of size 0 get an offset of 0
                    if (fCatalog[i][j].first == 0)
                        fCatalog[i][j].second = 0;
                }
            }

            //NOTE(review): heap_size is 0 when no tile was written, which makes this a
            //division by zero in floating point - confirm that this case is acceptable
            float compression_ratio = (float)(fRealRowWidth*fTable.num_rows)/(float)heap_size;
            SetFloat("ZRATIO", compression_ratio);

            //add to the heap size the size of the gap between the catalog and the actual heap
            heap_size += (fCatalog.size() - total_num_tiles_written)*fTable.num_cols*sizeof(uint64_t)*2;

            SetInt("PCOUNT", heap_size, "size of special data area");


            //Just for updating the fCatalogSum value
            WriteCatalog();

            fDataSum += fCatalogSum;

            const Checksum checksm = UpdateHeaderChecksum();

            ofstream::close();

            if ((checksm+fDataSum).valid())
                return true;

            //checksum mismatch: report the header checksum in hexadecimal
            ostringstream sout;
            sout << "Checksum (" << std::hex << checksm.val() << ") invalid.";
#ifdef __EXCEPTIONS
            throw runtime_error(sout.str());
#else
            gLog << ___err___ << "ERROR - " << sout.str() << endl;
            return false;
#endif
        }

        //Overload of the ofits method. Just calls the zofits specific one with default, uncompressed options for this column
        /// Adds a column described by a default-constructed BlockHeaderWriter.
        /// All other arguments are forwarded unchanged.
        bool AddColumn(uint32_t cnt, char typechar, const string& name, const string& unit, const string& comment="", bool addHeaderKeys=true)
        {
            return AddColumn(cnt, typechar, name, unit, BlockHeaderWriter(), comment, addHeaderKeys);
        }

        /// Adds a column whose BlockHeaderWriter is built from compressionScheme.
        /// All other arguments are forwarded unchanged.
        bool AddColumn(const string& compressionScheme, uint32_t cnt, char typechar, const string& name, const string& unit,  const string& comment="", bool addHeaderKeys=true)
        {
            return AddColumn(cnt, typechar, name, unit, BlockHeaderWriter(compressionScheme), comment, addHeaderKeys);
        }
        bool AddColumn(uint32_t cnt, char typechar, const string& name, const string& unit, const BlockHeaderWriter& header, const string& comment="", bool addHeaderKeys=true)
        {
            if (!ofits::AddColumn(1, 'Q', name, unit, comment, addHeaderKeys))
                return false;

            Table::Column col;
            size_t size = SizeFromType(typechar);

            col.name   = name;
            col.type   = typechar;
            col.num    = cnt;
            col.size   = size;
            col.offset = fRealRowWidth;

            fRealRowWidth += size*cnt;

            fRealColumns.emplace_back(CompressedColumn(col, header));

            ostringstream strKey, strVal, strCom;
            strKey << "ZFORM" << fRealColumns.size();
            strVal << cnt << typechar;
            strCom << "format of " << name << " [" << CommentFromType(typechar);
            SetStr(strKey.str(), strVal.str(), strCom.str());

            strKey.str("");
            strVal.str("");
            strCom.str("");
            strKey << "ZCTYP" << fRealColumns.size();
            strVal << "FACT";
            strCom << "Compression type FACT";
            SetStr(strKey.str(), strVal.str(), strCom.str());

            return true;
        }

        // Typed shortcuts: each one forwards to AddColumn with a fixed type character.

        /// Type character 'I'
        bool AddColumnShort(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'I', name, unit, comment);
        }

        /// Type character 'J'
        bool AddColumnInt(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'J', name, unit, comment);
        }

        /// Type character 'K'
        bool AddColumnLong(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'K', name, unit, comment);
        }

        /// Type character 'E'
        bool AddColumnFloat(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'E', name, unit, comment);
        }

        /// Type character 'D'
        bool AddColumnDouble(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'D', name, unit, comment);
        }

        /// Type character 'A'
        bool AddColumnChar(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'A', name, unit, comment);
        }

        /// Type character 'B'
        bool AddColumnByte(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'B', name, unit, comment);
        }

        /// Type character 'L'
        bool AddColumnBool(const string& compressionScheme, uint32_t cnt, const string &name, const string &unit="", const string &comment="")
        {
            return AddColumn(compressionScheme, cnt, 'L', name, unit, comment);
        }

        /// Sets the class-wide number of compression queues (fNumQueues).
        static void SetNumThreads(int32_t num)
        {
            fNumQueues = num;
        }

        /// Returns the class-wide number of compression queues (fNumQueues).
        static int32_t GetNumThreads()
        {
            return fNumQueues;
        }
    protected:

        /// Resizes the set of compression queues and records the new count in fNumQueues.
        /// @param num  number of compression queues wanted; -1 selects the number
        ///             of detected cores minus 2 (never less than 0); 0 means
        ///             that compression is done in-line
        /// @return true on success; false on error in builds without exceptions
        /// @throws runtime_error if the file is open, if num is greater than the
        ///         number of available cores or if num is smaller than -1
        bool SetNumWorkingThreads(int32_t num)
        {
            if (is_open())
            {
#ifdef __EXCEPTIONS
                throw runtime_error("File must be closed before changing the number of compression threads");
#else
                gLog << ___err___ << "ERROR - File must be closed before changing the number of compression threads" << endl;
#endif
                return false;
            }
#ifdef USE_BOOST_THREADS
            int32_t num_available_cores = boost::thread::hardware_concurrency();
#else
            int32_t num_available_cores = thread::hardware_concurrency();
#endif

            if (num_available_cores == 0)
            {//could not detect number of available cores from system properties...
                //Assuming that 5 cores are availables (4 compression, 1 write)
                num_available_cores = 5;
            }
            if (num > num_available_cores)
            {
                ostringstream str;
                str << "Number of threads cannot be greater than physically available (" << num_available_cores << ")";
#ifdef __EXCEPTIONS
                throw runtime_error(str.str());
#else
                gLog << ___err___ << "ERROR - " << str.str() << endl;
#endif
                return false;
            }

            //-1 is the only negative value with a meaning: reject the others
            //instead of converting them to a huge unsigned queue count below
            if (num < -1)
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Number of threads cannot be negative");
#else
                gLog << ___err___ << "ERROR - Number of threads cannot be negative" << endl;
#endif
                return false;
            }

            if (num == -1)
            {
                num = num_available_cores-2; // 1 for writing, one for the main thread

                //fewer than 2 cores: fall back to in-line compression
                if (num < 0)
                    num = 0;
            }

            if (fCompressionQueues.size() != (uint32_t)num)
            {
                //cannot be const, as resize does not want it that way
                Queue<CompressionTarget> queue(bind(&zofits::CompressBuffer, this, placeholders::_1), false, false);

                //shrink or grow
                fCompressionQueues.resize(num, queue);
            }

            //always keep fNumQueues in step with the actual number of queues: the
            //rest of the class uses it to choose between in-line and threaded mode
            fNumQueues = num;

            return true;
        }

        /// Sizes the memory pool chunks for one tile, takes a first chunk for the
        /// incoming rows, sizes the RAWSUM scratch buffer and resets the catalog to
        /// fNumTiles rows of fRealColumns.size() entries set to CatalogEntry(0,0).
        /// @return always true
        bool reallocateBuffers()
        {
            // one tile of raw rows + one block header per column + the tile header
            const size_t tile_chunk_size = fRealRowWidth*fNumRowsPerTile + fRealColumns.size()*sizeof(BlockHeader) + sizeof(TileHeader) + 8; //+8 for checksuming;
            fMemPool.setChunkSize(tile_chunk_size);

            fSmartBuffer = fMemPool.malloc();
            fBuffer      = fSmartBuffer->get();

            //for checksuming
            fRawSumBuffer.resize(fRealRowWidth + 4-fRealRowWidth%4);

            //give the catalog enough space
            fCatalog.resize(fNumTiles);
            for (auto row=fCatalog.begin(); row!=fCatalog.end(); row++)
                row->assign(fRealColumns.size(), CatalogEntry(0,0));

            return true;
        }

        /// Adds a compressed buffer to fDataSum and writes it to the stream.
        /// The data starts 4 bytes after src. For checksumming the range is
        /// extended with zero bytes: fCheckOffset bytes in front of the data and
        /// up to 3 bytes behind it, so that its length is a multiple of 4.
        /// @param src          buffer whose payload starts at src+4
        /// @param sizeToWrite  number of payload bytes
        /// @return the state of the stream (good()) after writing
        bool writeCompressedDataToDisk(char* src, uint32_t sizeToWrite)
        {
            char*    sum_begin = src+4;
            uint32_t sum_size  = sizeToWrite;

            //should we extend the array to the left ?
            if (fCheckOffset != 0)
            {
                sum_size  += fCheckOffset;
                sum_begin -= fCheckOffset;
                memset(sum_begin, 0, fCheckOffset);
            }

            //should we extend the array to the right ?
            int32_t trailing_pad = 0;
            if (sum_size%4 != 0)
            {
                trailing_pad = 4 - (sum_size%4);
                memset(sum_begin+sum_size, 0, trailing_pad);
                sum_size += trailing_pad;
            }

            //do the checksum
            fDataSum.add(sum_begin, sum_size);

            //remember the offset to apply to the next buffer
            fCheckOffset = (4 - trailing_pad)%4;

            //write data to disk
            write(src+4, sizeToWrite);

            return good();
        }

        /// Transposes and compresses one tile.
        /// When fNumQueues is 0 the compressed size is simply returned and any
        /// exception is rethrown to the caller. Otherwise an exception, if any, is
        /// kept in fThreadsException and the result is posted to fWriteToDiskQueue.
        /// @param target  the tile to compress and the buffers to use
        /// @return the number of bytes produced by the compression
        uint32_t CompressBuffer(const CompressionTarget& target)
        {
            uint64_t compressed_size = 0;
#ifdef __EXCEPTIONS
            try
            {
#endif
                //transpose the original data.
                //The row count recorded in the target is passed along: fTable.num_rows may
                //already have been increased by the time a worker thread gets here.
                copyTransposeTile(target.src.get()->get(), target.transposed_src.get()->get(), target.num_rows);

                //compress the buffer
                compressed_size = compressBuffer(target.target.target.get()->get(), target.transposed_src.get()->get(), target.num_rows);
#ifdef __EXCEPTIONS
            }
            catch (...)
            {
                fThreadsException = current_exception();
                if (fNumQueues == 0)
                    rethrow_exception(fThreadsException);
            }
#endif

            if (fNumQueues == 0)
                return compressed_size;

            //post the result to the writing queue
            //get a copy so that it becomes non-const
            WriteTarget wt;
            wt.tile_num = target.target.tile_num;
            wt.size     = compressed_size;
            wt.target   = target.target.target;

            fWriteToDiskQueue.post(wt);

            return compressed_size;
        }

        /// Writes a compressed tile to disk if it is the next one in sequence.
        /// @param target  the tile to write
        /// @return false if target is not the tile following the last written
        ///         one; otherwise the result of writeCompressedDataToDisk
        bool WriteBufferToDisk(const WriteTarget& target)
        {
            //is this the tile we're supposed to write ?
            if (target.tile_num != (uint32_t)(fLatestWrittenTile+1))
                return false;

            fLatestWrittenTile++;

            //write the buffer to disk.
            return writeCompressedDataToDisk(target.target.get()->get(), target.size);
        }

        /// Compresses a transposed tile column by column and fills the catalog row
        /// of the tile with the offset and size of each column block.
        /// @param dest      output buffer; its first 4 bytes are skipped (checksum area)
        /// @param src       transposed tile. Cannot be const, as applySMOOTHING is done in place
        /// @param num_rows  total number of rows written when the tile was queued
        /// @return the number of bytes written after the checksum area, tile header included
        uint64_t compressBuffer(char* dest, char* src, uint32_t num_rows)
        {
            //number of rows in this tile: a full tile unless num_rows is not a multiple of the tile length
            uint32_t thisRoundNumRows = (num_rows%fNumRowsPerTile) ? num_rows%fNumRowsPerTile : fNumRowsPerTile;
            uint32_t offset=0;
            uint32_t currentCatalogRow = (num_rows-1)/fNumRowsPerTile;

            //skip the checksum reserved area
            dest += 4;

            //skip the 'TILE' marker and tile size entry
            uint64_t compressedOffset = sizeof(TileHeader);

            //now compress each column one by one by calling compression on arrays
            for (uint32_t i=0;i<fRealColumns.size();i++)
            {
                fCatalog[currentCatalogRow][i].second = compressedOffset;

                if (fRealColumns[i].col.num == 0) continue;

                BlockHeaderWriter& head = fRealColumns[i].block_head;

                //remember where the block of this column starts
                uint64_t previousOffset = compressedOffset;

                //skip header data
                compressedOffset += head.SizeOnDisk();

                //apply the processings of the column in sequence
                for (uint32_t j=0;j<head.NumProcs();j++)
                {
                    switch (head.Proc(j))
                    {
                        case kFactRaw:
                                compressedOffset += compressUNCOMPRESSED(dest + compressedOffset, src  + offset, thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num);
                        break;
                        case kFactSmoothing:
                                applySMOOTHING(src + offset, thisRoundNumRows*fRealColumns[i].col.num);
                        break;
                        case kFactHuffman16:
                            if (head.Ordering() == kOrderByCol)
                                compressedOffset += compressHUFFMAN(dest + compressedOffset, src  + offset, thisRoundNumRows, fRealColumns[i].col.size, fRealColumns[i].col.num);
                            else
                                compressedOffset += compressHUFFMAN(dest + compressedOffset, src  + offset, fRealColumns[i].col.num, fRealColumns[i].col.size, thisRoundNumRows);
                        break;
                        default:
                        {
                            ostringstream str;
                            str << "Unkown compression sequence entry: " << head.Proc(j);
#ifdef __EXCEPTIONS
                            throw runtime_error(str.str());
#else
                            gLog << ___err___ << "ERROR - " << str.str();
                            return 0;
#endif
                        }
                    }
                }

               //check if compressed size is larger than uncompressed
                if ((head.Proc(0) != kFactRaw) && (compressedOffset - previousOffset > fRealColumns[i].col.size*fRealColumns[i].col.num*thisRoundNumRows+head.SizeOnDisk()))
                {//if so set flag and redo it uncompressed
                    cout << "Redoing uncompressed ! " << endl;
                    //de-smooth !
                    if (head.Proc(0) == kFactSmoothing)
                        UnApplySMOOTHING(src+offset, fRealColumns[i].col.num*thisRoundNumRows);

                    //store the column raw, with a default block header
                    BlockHeaderWriter he;
                    compressedOffset = previousOffset + he.SizeOnDisk();
                    compressedOffset += compressUNCOMPRESSED(dest + compressedOffset, src + offset, thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num);
                    he.SetBlockSize(compressedOffset - previousOffset);
                    he.Write(dest+previousOffset);
                    offset += thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num;
                    fCatalog[currentCatalogRow][i].first = compressedOffset - fCatalog[currentCatalogRow][i].second;
                    continue;
                }

                head.SetBlockSize(compressedOffset - previousOffset);
                head.Write(dest + previousOffset);

                offset += thisRoundNumRows*fRealColumns[i].col.size*fRealColumns[i].col.num;
                fCatalog[currentCatalogRow][i].first = compressedOffset - fCatalog[currentCatalogRow][i].second;
            }

            //the tile header goes in front of the column blocks
            TileHeader tile_head(thisRoundNumRows, compressedOffset);
            memcpy(dest, &tile_head, sizeof(TileHeader));

            return compressedOffset;
        }

        /// Copies a tile from src to dest, column after column, using the ordering
        /// requested by each column's block header.
        /// @param src       the rows of the tile, one after the other
        /// @param dest      receives the reordered data
        /// @param num_rows  total number of rows written when the tile was queued.
        ///                  The default 0 falls back to the current value of
        ///                  fTable.num_rows; that is only reliable when no row
        ///                  is written while the tile is being processed
        void copyTransposeTile(const char* src, char* dest, uint32_t num_rows=0)
        {
            if (num_rows == 0)
                num_rows = fTable.num_rows;

            //same computation as in compressBuffer, so that both agree on the size of the tile
            const uint32_t thisRoundNumRows = (num_rows%fNumRowsPerTile) ? num_rows%fNumRowsPerTile : fNumRowsPerTile;

            //copy the tile and transpose it
            for (uint32_t i=0;i<fRealColumns.size();i++)
            {
                switch (fRealColumns[i].block_head.Ordering())
                {
                    case kOrderByRow:
                        for (uint32_t k=0;k<thisRoundNumRows;k++)
                        {//regular, "semi-transposed" copy
                            memcpy(dest, src+k*fRealRowWidth+fRealColumns[i].col.offset, fRealColumns[i].col.size*fRealColumns[i].col.num);
                            dest += fRealColumns[i].col.size*fRealColumns[i].col.num;
                        }
                    break;

                    case kOrderByCol :
                        for (uint32_t j=0;j<fRealColumns[i].col.num;j++)
                            for (uint32_t k=0;k<thisRoundNumRows;k++)
                            {//transposed copy
                                memcpy(dest, src+k*fRealRowWidth+fRealColumns[i].col.offset+fRealColumns[i].col.size*j, fRealColumns[i].col.size);
                                dest += fRealColumns[i].col.size;
                            }
                    break;
                    default:
                    {
                            ostringstream str;
                            str << "Unkown column ordering: " << fRealColumns[i].block_head.Ordering();
#ifdef __EXCEPTIONS
                            throw runtime_error(str.str());
#else
                            gLog << ___err___ << "ERROR - " << str.str();
                            return;
#endif
                    }
                };
            }
        }

        /// Specific compression functions
        // Pass-through "compression": stores the input unchanged.
        //   dest  output buffer, receives size bytes
        //   src   input data
        //   size  number of bytes to copy
        // Returns the number of bytes written to dest, which is always size.
        uint32_t compressUNCOMPRESSED(char* dest, const char* src, uint32_t size)
        {
            const uint32_t bytesStored = size;
            memcpy(dest, src, bytesStored);

            return bytesStored;
        }

        // Huffman-encodes a transposed column, one row-element at a time.
        //   dest         output buffer
        //   src          input data; the values of element j of all rows are read
        //                starting at byte j*sizeOfElems*numRows
        //   numRows      number of rows in the tile
        //   sizeOfElems  size in bytes of one element (must be at least 2)
        //   numRowElems  number of elements per row
        // Layout written to dest: numRowElems uint32_t values holding the encoded
        // size of each element's stream, followed by the concatenated encoded data.
        // Returns the total number of bytes of that layout. The encoded data is only
        // copied when the total is smaller than the raw size, so a return value that
        // is not smaller than numRows*sizeOfElems*numRowElems tells the caller that
        // dest does not hold a usable result (the size fields are written regardless).
        // With less than 2 rows nothing is encoded and an oversized value is returned.
        // For sizeOfElems < 2 a runtime_error is thrown when __EXCEPTIONS is defined,
        // otherwise the error is logged and 0 is returned.
        uint32_t compressHUFFMAN(char* dest, const char* src, uint32_t numRows, uint32_t sizeOfElems, uint32_t numRowElems)
        {
            if (numRows < 2)
            {//if we have less than 2 elems to compress, Huffman encoder does not work (and has no point). Just return larger size than uncompressed to trigger the raw storage.
                return numRows*sizeOfElems*numRowElems + 1000;
            }
            if (sizeOfElems < 2 )
            {
#ifdef __EXCEPTIONS
                throw runtime_error("Fatal ERROR: HUFMANN can only encode short or longer types");
#else
                gLog << ___err___ << "ERROR - Fatal ERROR: HUFMANN can only encode short or longer types";
                return 0;
#endif
            }

            string huffmanOutput;
            uint32_t previousHuffmanSize = 0;
            uint32_t huffmanOffset = 0;
            for (uint32_t j=0;j<numRowElems;j++)
            {
                // NOTE(review): the size bookkeeping below relies on Encode appending to huffmanOutput - confirm
                Huffman::Encode(huffmanOutput,
                                reinterpret_cast<const uint16_t*>(&src[j*sizeOfElems*numRows]),
                                numRows*(sizeOfElems/2));

                // dest is a plain byte buffer at an arbitrary offset: store the size with
                // memcpy rather than through a (possibly misaligned) uint32_t pointer
                const uint32_t encodedSize = static_cast<uint32_t>(huffmanOutput.size() - previousHuffmanSize);
                memcpy(&dest[huffmanOffset], &encodedSize, sizeof(uint32_t));

                huffmanOffset += sizeof(uint32_t);
                previousHuffmanSize = huffmanOutput.size();
            }
            const size_t totalSize = huffmanOutput.size() + huffmanOffset;

            //only copy if not larger than not-compressed size
            if (totalSize < numRows*sizeOfElems*numRowElems)
                memcpy(&dest[huffmanOffset], huffmanOutput.data(), huffmanOutput.size());

            return totalSize;
        }

        // Applies the integer smoothing in place: data is interpreted as numElems
        // 16-bit signed integers, and every value from index 2 on is replaced by its
        // difference to the (integer) average of its two predecessors. The first two
        // values are left untouched. UnApplySMOOTHING is the inverse transform.
        //   data      buffer holding numElems int16_t values (it is accessed through
        //             an int16_t pointer, so it must be suitably aligned)
        //   numElems  number of 16-bit values in data
        // Returns the size of the processed data in bytes.
        uint32_t applySMOOTHING(char* data, uint32_t numElems)
        {
            int16_t* short_data = reinterpret_cast<int16_t*>(data);

            // Walk backwards so that both predecessors still hold their original values.
            // The index is unsigned like numElems: a signed counter would turn negative
            // for very large counts and silently skip the whole transform.
            for (uint32_t j=numElems; j>2; )
            {
                --j;
                short_data[j] = short_data[j] - (short_data[j-1]+short_data[j-2])/2;
            }

            return numElems*sizeof(int16_t);
        }
        // Apply the inverse transform of the integer smoothing
        // Works in place on data, interpreted as numElems 16-bit signed integers:
        // going forward from index 2, the (integer) average of the two preceding,
        // already restored values is added back to each value. The first two values
        // are left untouched.
        // Returns the size of the processed data in bytes.
        uint32_t UnApplySMOOTHING(char*   data, uint32_t   numElems)
        {
            int16_t* const samples = reinterpret_cast<int16_t*>(data);

            uint32_t idx = 2;
            while (idx < numElems)
            {
                const int predecessorsMean = (samples[idx-1]+samples[idx-2])/2;
                samples[idx] = samples[idx] + predecessorsMean;
                ++idx;
            }

            return numElems*sizeof(uint16_t);
        }
        //Compressed data stuff
        int32_t         fCheckOffset;       ///< offset to the data pointer to calculate the checksum
        uint32_t        fNumTiles;          ///< number of tiles (NOTE(review): presumably taken from the constructor's numTiles argument - confirm)
        uint32_t        fNumRowsPerTile;    ///< number of rows of a complete tile; a tile holds fTable.num_rows%fNumRowsPerTile rows when that remainder is not zero

        MemoryManager        fMemPool;      ///< memory manager, constructed with the constructor's maxUsableMem argument

        //thread related stuff
        vector<Queue<CompressionTarget>> fCompressionQueues; ///< queues of CompressionTarget items (NOTE(review): presumably one queue per compression thread - confirm)
        Queue<WriteTarget>               fWriteToDiskQueue;  ///< queue of WriteTarget items, constructed with WriteBufferToDisk as its handler

        //thread count, shared by all instances (static)
        static int32_t          fNumQueues;    ///< The number of threads that will be used to compress

        int32_t           fLatestWrittenTile;  ///< NOTE(review): presumably the number of the last tile written to disk - confirm
#ifdef __EXCEPTIONS
        exception_ptr     fThreadsException;   ///< only available with exceptions enabled (NOTE(review): presumably stores an exception raised in a worker thread - confirm)
#endif
        // One cell of the tile catalog: a pair of 64-bit integers, stored packed.
        // NOTE(review): the compression code assigns (end offset - second) to first,
        // so first looks like the compressed size of a column block and second like
        // its start offset - confirm.
        struct CatalogEntry
        {
            int64_t first;
            int64_t second;

            // Both values default to zero
            CatalogEntry(int64_t f=0, int64_t s=0)
                : first(f),
                  second(s)
            {
            }
        } __attribute__((__packed__));

        typedef vector<CatalogEntry>   CatalogRow;   ///< the catalog entries of one catalog row, indexed like fRealColumns
        typedef vector<CatalogRow>     CatalogType;  ///< the complete catalog, indexed by catalog row
        CatalogType          fCatalog;          ///< the tile catalog, filled while the tiles are compressed
        Checksum             fCatalogSum;       ///< NOTE(review): presumably the checksum of the catalog - confirm
        Checksum             fRawSum;           ///< NOTE(review): presumably the checksum of the uncompressed data - confirm
        off_t                fCatalogOffset;    ///< NOTE(review): presumably the position of the catalog in the file - confirm
        uint32_t             fRealRowWidth;     ///< width in bytes of one row of the source buffer (used as row stride by copyTransposeTile)
        uint32_t             fCatalogExtraRows; ///< NOTE(review): meaning not visible in this part of the file - confirm
        vector<char>         fRawSumBuffer;     ///< NOTE(review): presumably scratch space for the fRawSum calculation - confirm
        uint64_t             fMaxUsableMem;     ///< NOTE(review): presumably the constructor's maxUsableMem argument - confirm

        shared_ptr<MemoryChunk> fSmartBuffer;   ///< shared handle to a MemoryChunk (NOTE(review): presumably the chunk backing fBuffer - confirm)
        char*                   fBuffer;        ///< raw pointer to a byte buffer (NOTE(review): presumably pointing into fSmartBuffer - confirm)

        // Bundles the description of a table column with the block header used
        // for it. Both parts are stored as copies of the constructor arguments.
        struct CompressedColumn
        {
            Table::Column     col;         ///< the column description (offset, size and num are used when copying tiles)
            BlockHeaderWriter block_head;  ///< the block header of the column; its Ordering() selects the tile copy layout

            CompressedColumn(const Table::Column& c, const BlockHeaderWriter& h)
                : col(c),
                  block_head(h)
            {
            }
        };
        vector<CompressedColumn> fRealColumns;  ///< the columns processed by the tile copy and compression code

};

// Out-of-class definition of the static thread counter of zofits, starting at 0.
// NOTE(review): if this file is a header included from more than one translation
// unit, this non-inline definition would be emitted in each of them - confirm
// how the file is included.
int32_t zofits::fNumQueues = 0;

#ifndef __MARS__
}; //namespace std
#endif

// Usage sketch. It is only seen by the compiler if crappy_example_usage is defined,
// and it refers to variables (numThreads, fileNameOut, sortedColumns, colName, ...)
// that are not declared here, so it is meant to be read rather than compiled.
#ifdef crappy_example_usage
// create the writer: 123456 tiles, 100 rows per tile
zofits zofitsfile(123456, 100);
// choose the number of working threads, then open the output file
zofitsfile.SetNumWorkingThreads(numThreads);
zofitsfile.open((fileNameOut).c_str());
// a block header requesting row ordering, plus the list of processings
// (smoothing followed by 16-bit Huffman) to be used for a column
std::zofits::BlockHeader zoheader(0, kOrderByRow, 2);
vector<uint16_t> smoothmanProcessings(2);
smoothmanProcessings[0] = kFactSmoothing;
smoothmanProcessings[1] = kFactHuffman16;

// add a column without giving a block header or processings
zofitsfile.AddColumn(sortedColumns[i].num,
                     sortedColumns[i].type,
                     colName,
                     "");

// add a column with the block header and processings defined above
zofitsfile.AddColumn(sortedColumns[i].num,
                     sortedColumns[i].type,
                     colName,
                     "",
                     zoheader,
                     smoothmanProcessings);

// set a header key, provide the DRS calibration, write the table header,
// write a row and close the file
zofitsfile.SetStr("ZCHKSUM", i->second.value, i->second.comment);
zofitsfile.SetDrsCalibration(drsCalibFloat);
zofitsfile.WriteTableHeader(tableName.c_str());
zofitsfile.WriteRow(buffer, rowWidth);
zofitsfile.close();

#endif
