#include <stdint.h>
#include <vector>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <stdio.h>      /* printf */
#include <time.h>       /* time_t, struct tm, difftime, time, mktime */


/// Number of readout channels. Each channel owns a block of 1024 consecutive
/// cells in the accumulators below (channel ch starts at index ch*1024).
/// A typed constant instead of a macro, so it obeys scope and has a fixed type.
constexpr size_t fNumChannels = 1440;

/// Per-cell running sum of the samples accumulated by the AddRel_* functions.
/// Must be sized to at least fNumChannels*1024 before those functions are called.
std::vector<int64_t> fSum;

/// Per-cell running sum of the squared samples accumulated by the AddRel_*
/// functions. Same size requirement as fSum.
std::vector<int64_t> fSum2;

/// Accumulates one event into the global per-cell sums fSum and fSum2.
///
/// For every channel ch, the 1024 samples val[ch*1024 + i] are indexed
/// relative to that channel's start cell start[ch]: sample i is added to
/// cell (start[ch] + i) % 1024 of the channel's block in fSum, and its square
/// to the same cell in fSum2. Channels with a negative start value are skipped.
///
/// This produces the same result as AddRel_simple, but avoids evaluating the
/// modulo for every sample by walking the ring in two linear passes.
/// The original author reports this to be about 2.5 times faster, because the
/// compiler's optimization is not hampered by the per-sample %1024.
///
/// @param val    fNumChannels*1024 samples, channel-major.
/// @param start  fNumChannels start cells, one per channel; negative = skip.
///
/// Precondition: fSum and fSum2 each hold at least fNumChannels*1024 elements.
void AddRel_complex(const int16_t *val, const int16_t *start)
{
    constexpr size_t kNumCells = 1024; // cells (and samples) per channel

    for (size_t ch=0; ch<fNumChannels; ch++)
    {
        const int16_t spos = start[ch];
        if (spos<0)
            continue;

        // Wrap the start cell once per channel. For the expected range
        // 0..1023 this is a no-op; for larger values it keeps the pointers
        // inside the channel block and matches AddRel_simple's modulo.
        const size_t first = static_cast<size_t>(spos) % kNumCells;

        const size_t pos = ch*kNumCells;

        const int16_t *beg_val  = val          + pos;
        int64_t       *beg_sum  = fSum.data()  + pos;
        int64_t       *beg_sum2 = fSum2.data() + pos;

        const int16_t *pval  = beg_val;           // val[rel]
        int64_t       *psum  = beg_sum  + first;  // fSum[abs]
        int64_t       *psum2 = beg_sum2 + first;  // fSum2[abs]

        // First pass: from the start cell up to the end of the channel block.
        while (psum<beg_sum+kNumCells)
        {
            const int64_t v = *pval++;

            *psum++  += v;
            *psum2++ += v*v;
        }

        // Second pass: wrap around to the first cell of the block and
        // consume the remaining samples.
        psum  = beg_sum;
        psum2 = beg_sum2;

        while (pval<beg_val+kNumCells)
        {
            const int64_t v = *pval++;

            *psum++  += v;
            *psum2++ += v*v;
        }
    }
}

/// Reference implementation: accumulates one event into fSum and fSum2.
///
/// Sample i of channel ch (val[ch*1024 + i]) is added to cell
/// (start[ch] + i) % 1024 of that channel's block in fSum, and its square to
/// the same cell of fSum2. Channels whose start value is negative are skipped.
/// The modulo is evaluated for every single sample.
void AddRel_simple(const int16_t *val, const int16_t *start)
{
    for (size_t channel = 0; channel < fNumChannels; ++channel)
    {
        const int16_t first_cell = start[channel];
        if (first_cell >= 0)
        {
            const size_t base   = channel * 1024;
            const size_t offset = static_cast<size_t>(first_cell);

            for (size_t sample = 0; sample != 1024; ++sample)
            {
                // The sample index is relative to the trigger; the cell index
                // is the corresponding position in the DRS pipeline.
                const int64_t value = val[base + sample];
                const size_t  cell  = base + (offset + sample) % 1024;

                fSum[cell]  += value;
                fSum2[cell] += value * value;
            }
        }
    }
}

/// Benchmark driver.
///
/// Builds one synthetic event (every sample equal to 3, one start cell per
/// channel drawn from rand()%1024), then times REPS calls of AddRel_simple
/// followed by REPS calls of AddRel_complex on that same input and prints the
/// elapsed processor time reported by clock() for each.
int main(void){
    const int REPS = 1000;
    const size_t numCells = static_cast<size_t>(fNumChannels) * 1024;

    std::vector<int16_t> values(numCells, 3);
    std::vector<int16_t> start(fNumChannels, 0);
    for (size_t i=0; i<start.size(); i++){
        start[i] = static_cast<int16_t>(rand()%1024);
    }

    // assign() (unlike resize() on an already-sized vector) really zeroes the
    // accumulators, so each run starts from the same clean state.
    fSum.assign(numCells, 0);
    fSum2.assign(numCells, 0);
    clock_t t = clock();
    for(int i=0; i<REPS; i++){
        AddRel_simple(values.data(), start.data());
    }
    t = clock() - t;
    printf ("AddRel_simple took %f s.\n", static_cast<double>(t)/CLOCKS_PER_SEC);

    fSum.assign(numCells, 0);
    fSum2.assign(numCells, 0);
    t = clock();
    for(int i=0; i<REPS; i++){
        AddRel_complex(values.data(), start.data());
    }
    t = clock() - t;
    printf ("AddRel_complex took %f s.\n", static_cast<double>(t)/CLOCKS_PER_SEC);

    return 0;
}
