Index: trunk/Mars/mcore/DrsCalib.h
===================================================================
--- trunk/Mars/mcore/DrsCalib.h	(revision 16560)
+++ trunk/Mars/mcore/DrsCalib.h	(revision 16561)
@@ -51,12 +51,12 @@
     void AddRel(const int16_t *val, const int16_t *start)
     {
+        /*
         for (size_t ch=0; ch<fNumChannels; ch++)
         {
-            const int16_t spos = start[ch];
+            const int16_t &spos = start[ch];
             if (spos<0)
                 continue;
 
             const size_t pos = ch*1024;
-
             for (size_t i=0; i<1024; i++)
             {
@@ -71,4 +71,43 @@
                 fSum2[abs] += v*v;
             }
+        */
+
+        // This version is 2.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+
+            const size_t pos = ch*1024;
+
+            const int16_t *pval    = val + pos;
+            const int16_t *end_val = val + 1024;
+
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+
+            int64_t *psum  = beg_sum  + spos;
+            int64_t *psum2 = beg_sum2 + spos;
+
+            while (psum<beg_sum+1024)
+            {
+                const int64_t v = *pval++;
+
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+
+            psum  = beg_sum;
+            psum2 = beg_sum2;
+
+            while (pval<end_val)
+            {
+                const int64_t v = *pval++;
+
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
         }
 
@@ -79,4 +118,5 @@
                 const int32_t *offset, const uint32_t scale)
     {
+        /*
         for (size_t ch=0; ch<fNumChannels; ch++)
         {
@@ -85,5 +125,5 @@
                 continue;
 
-            const size_t pos = ch*fNumSamples;
+            const size_t pos = ch*1024;
 
             for (size_t i=0; i<fNumSamples; i++)
@@ -93,5 +133,5 @@
                 // Abs is corresponding index relative to DRS pipeline
                 const size_t rel = pos +  i;
-                const size_t abs = pos + (spos+i)%fNumSamples;
+                const size_t abs = pos + (spos+i)%1024;
 
                 const int64_t v = int64_t(val[rel])*scale-offset[abs];
@@ -99,4 +139,47 @@
                 fSum[abs]  += v;
                 fSum2[abs] += v*v;
+            }
+        }*/
+
+        // This version is 2.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+
+            const size_t pos = ch*1024;
+
+            const int16_t *pval    = val + pos;
+            const int16_t *end_val = val + 1024;
+
+            const int32_t *beg_offset = offset + pos;
+            const int32_t *poffset    = beg_offset + spos;
+
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+
+            int64_t *psum     = beg_sum    + spos;
+            int64_t *psum2    = beg_sum2   + spos;
+
+            while (psum<beg_sum+1024)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+
+            psum    = beg_sum;
+            psum2   = beg_sum2;
+            poffset = beg_offset;
+
+            while (pval<end_val)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+
+                *psum++  = v;
+                *psum2++ = v*v;
             }
         }
@@ -122,5 +205,5 @@
                 // Abs is corresponding index relative to DRS pipeline
                 const size_t rel = pos +  i;
-                const size_t abs = pos + (spos+i)%fNumSamples;
+                const size_t abs = pos + (spos+i)%1024;
 
                 const int64_t v = int64_t(val[rel])*scale-offset[abs];
@@ -137,4 +220,5 @@
                 const int32_t *offset, const uint32_t scale)
     {
+        /*
         // 1440 without tm, 1600 with tm
         for (size_t ch=0; ch<fNumChannels; ch++)
@@ -160,4 +244,47 @@
                 fSum2[rel] += v*v;
             }
+        }*/
+
+        // This version is 1.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+
+            const size_t pos = ch*fNumSamples;
+
+            const int16_t *pval = val + pos;
+
+            const int32_t *beg_offset = offset + ch*1024;
+            const int32_t *poffset    = beg_offset + spos;
+
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+
+            int64_t *psum     = beg_sum;
+            int64_t *psum2    = beg_sum2;
+
+            if (spos+fNumSamples>1024)
+            {
+                while (poffset<beg_offset+1024)
+                {
+                    const int64_t v = int64_t(*pval++)*scale - *poffset++;
+
+                    *psum++  = v;
+                    *psum2++ = v*v;
+                }
+
+                poffset = beg_offset;
+            }
+
+            while (psum<beg_sum+fNumSamples)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
         }
 
@@ -176,4 +303,5 @@
         }
 
+        /*
         for (size_t i=0; i<roi; i++)
         {
@@ -189,4 +317,41 @@
             const int64_t div = gain[abs];
             vec[i] = div==0 ? 0 : double(v)*scalegain/div;
+        }*/
+
+        // This version is faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        // (Here we are dominated by numerics... improvement ~10%)
+        const int32_t *poffset = offset + start;
+        const int64_t *pgain   = gain   + start;
+        const int16_t *pval    = val;
+
+        float *pvec = vec;
+
+        if (start+roi>1024)
+        {
+            while (poffset<offset+1024)
+            {
+                const int64_t v =
+                    + int64_t(*pval++)*scaleabs - *poffset++
+                    ;
+
+                *pvec++ = *pgain==0 ? 0 : double(v)*scalegain / *pgain;
+
+                pgain++;
+            }
+
+            poffset = offset;
+            pgain   = gain;
+        }
+
+        while (pvec<vec+roi)
+        {
+            const int64_t v =
+                + int64_t(*pval++)*scaleabs - *poffset++
+                ;
+
+            *pvec++ = *pgain==0 ? 0 : double(v)*scalegain / *pgain;
+
+            pgain++;
         }
     }
@@ -202,5 +367,5 @@
             return;
         }
-
+        /*
         for (size_t i=0; i<roi; i++)
         {
@@ -217,4 +382,44 @@
             const int64_t div = gain[abs]*scalerel;
             vec[i] = div==0 ? 0 : double(v)*scalegain/div;
+        }*/
+
+        // (Here we are dominated by numerics... improvement ~10%)
+        const int32_t *poffset = offset + start;
+        const int64_t *pgain   = gain   + start;
+        const int16_t *pval    = val;
+        const int64_t *ptrgoff = trgoff;
+
+        float *pvec = vec;
+
+        if (start+roi>1024)
+        {
+            while (poffset<offset+1024)
+            {
+                const int64_t v =
+                    + (int64_t(*pval++)*scaleabs - *poffset++)*scalerel
+                    - *ptrgoff++;
+                ;
+
+                const int64_t div = *pgain * scalerel;
+                *pvec++ = div==0 ? 0 : double(v)*scalegain / div;
+
+                pgain++;
+            }
+
+            poffset = offset;
+            pgain   = gain;
+        }
+
+        while (pvec<vec+roi)
+        {
+            const int64_t v =
+                + (int64_t(*pval++)*scaleabs - *poffset++)*scalerel
+                - *ptrgoff++;
+            ;
+
+            const int64_t div = *pgain * scalerel;
+            *pvec++ = div==0 ? 0 : double(v)*scalegain / div;
+
+            pgain++;
         }
     }
