Context Navigation

← Previous Change
Next Change →

mcore

Timestamp:

06/01/13 19:55:16 (12 years ago)

Author:

tbretz

Message:

Performance improved and heavily tested version of the DRS calibration algorithms.

File:

: 1 edited

trunk/Mars/mcore/DrsCalib.h (modified) (13 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Mars/mcore/DrsCalib.h

-              r16428
+              r16561
     void AddRel(const int16_t *val, const int16_t *start)
+    {
+        /*
         for (size_t ch=0; ch<fNumChannels; ch++)
+        {
             const int16_t spos = start[ch];
+            const int16_t &spos = start[ch];
             if (spos<0)
                 continue;
             const size_t pos = ch*1024;
             for (size_t i=0; i<1024; i++)
+            {
 …
                 fSum2[abs] += v*v;
+            }
+        */
+        // This version is 2.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+            const size_t pos = ch*1024;
+            const int16_t *pval    = val + pos;
+            const int16_t *end_val = val + 1024;
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+            int64_t *psum  = beg_sum  + spos;
+            int64_t *psum2 = beg_sum2 + spos;
+            while (psum<beg_sum+1024)
+            {
+                const int64_t v = *pval++;
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+            psum  = beg_sum;
+            psum2 = beg_sum2;
+            while (pval<end_val)
+            {
+                const int64_t v = *pval++;
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+        }
 …
                 const int32_t *offset, const uint32_t scale)
+    {
+        /*
         for (size_t ch=0; ch<fNumChannels; ch++)
+        {
 …
                 continue;
             const size_t pos = ch*fNumSamples;
+            const size_t pos = ch*1024;
             for (size_t i=0; i<fNumSamples; i++)
 …
                 // Abs is corresponding index relative to DRS pipeline
                 const size_t rel = pos +  i;
                 const size_t abs = pos + (spos+i)%fNumSamples;
+                const size_t abs = pos + (spos+i)%1024;
                 const int64_t v = int64_t(val[rel])*scale-offset[abs];
 …
                 fSum[abs]  += v;
                 fSum2[abs] += v*v;
+            }
+        }*/
+        // This version is 2.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+            const size_t pos = ch*1024;
+            const int16_t *pval    = val + pos;
+            const int16_t *end_val = val + 1024;
+            const int32_t *beg_offset = offset + pos;
+            const int32_t *poffset    = beg_offset + spos;
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+            int64_t *psum     = beg_sum    + spos;
+            int64_t *psum2    = beg_sum2   + spos;
+            while (psum<beg_sum+1024)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+            psum    = beg_sum;
+            psum2   = beg_sum2;
+            poffset = beg_offset;
+            while (pval<end_val)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+        }
 …
                 // Abs is corresponding index relative to DRS pipeline
                 const size_t rel = pos +  i;
                 const size_t abs = pos + (spos+i)%fNumSamples;
+                const size_t abs = pos + (spos+i)%1024;
                 const int64_t v = int64_t(val[rel])*scale-offset[abs];
 …
                 const int32_t *offset, const uint32_t scale)
+    {
+        /*
         // 1440 without tm, 1600 with tm
         for (size_t ch=0; ch<fNumChannels; ch++)
 …
                 fSum2[rel] += v*v;
+            }
+        }*/
+        // This version is 1.5 times faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        for (size_t ch=0; ch<fNumChannels; ch++)
+        {
+            const int16_t &spos = start[ch];
+            if (spos<0)
+                continue;
+            const size_t pos = ch*fNumSamples;
+            const int16_t *pval = val + pos;
+            const int32_t *beg_offset = offset + ch*1024;
+            const int32_t *poffset    = beg_offset + spos;
+            int64_t *beg_sum  = fSum.data()  + pos;
+            int64_t *beg_sum2 = fSum2.data() + pos;
+            int64_t *psum     = beg_sum;
+            int64_t *psum2    = beg_sum2;
+            if (spos+fNumSamples>1024)
+            {
+                while (poffset<beg_offset+1024)
+                {
+                    const int64_t v = int64_t(*pval++)*scale - *poffset++;
+                    *psum++  = v;
+                    *psum2++ = v*v;
+                }
+                poffset = beg_offset;
+            }
+            while (psum<beg_sum+fNumSamples)
+            {
+                const int64_t v = int64_t(*pval++)*scale - *poffset++;
+                *psum++  = v;
+                *psum2++ = v*v;
+            }
+        }
 …
+        }
+        /*
         for (size_t i=0; i<roi; i++)
+        {
 …
             const int64_t div = gain[abs];
             vec[i] = div==0 ? 0 : double(v)*scalegain/div;
+        }*/
+        // This version is faster because the compilers optimization
+        // is not biased by the evaluation of %1024
+        // (Here we are dominated by numerics... improvement ~10%)
+        const int32_t *poffset = offset + start;
+        const int64_t *pgain   = gain   + start;
+        const int16_t *pval    = val;
+        float *pvec = vec;
+        if (start+roi>1024)
+        {
+            while (poffset<offset+1024)
+            {
+                const int64_t v =
+                    + int64_t(*pval++)*scaleabs - *poffset++
+                    ;
+                *pvec++ = *pgain==0 ? 0 : double(v)*scalegain / *pgain;
+                pgain++;
+            }
+            poffset = offset;
+            pgain   = gain;
+        }
+        while (pvec<vec+roi)
+        {
+            const int64_t v =
+                + int64_t(*pval++)*scaleabs - *poffset++
+                ;
+            *pvec++ = *pgain==0 ? 0 : double(v)*scalegain / *pgain;
+            pgain++;
+        }
+    }
 …
             return;
+        }
+        /*
         for (size_t i=0; i<roi; i++)
+        {
 …
             const int64_t div = gain[abs]*scalerel;
             vec[i] = div==0 ? 0 : double(v)*scalegain/div;
+        }*/
+        // (Here we are dominated by numerics... improvement ~10%)
+        const int32_t *poffset = offset + start;
+        const int64_t *pgain   = gain   + start;
+        const int16_t *pval    = val;
+        const int64_t *ptrgoff = trgoff;
+        float *pvec = vec;
+        if (start+roi>1024)
+        {
+            while (poffset<offset+1024)
+            {
+                const int64_t v =
+                    + (int64_t(*pval++)*scaleabs - *poffset++)*scalerel
+                    - *ptrgoff++;
+                ;
+                const int64_t div = *pgain * scalerel;
+                *pvec++ = div==0 ? 0 : double(v)*scalegain / div;
+                pgain++;
+            }
+            poffset = offset;
+            pgain   = gain;
+        }
+        while (pvec<vec+roi)
+        {
+            const int64_t v =
+                + (int64_t(*pval++)*scaleabs - *poffset++)*scalerel
+                - *ptrgoff++;
+            ;
+            const int64_t div = *pgain * scalerel;
+            *pvec++ = div==0 ? 0 : double(v)*scalegain / div;
+            pgain++;
+        }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 16561 for trunk/Mars/mcore

Legend:

trunk/Mars/mcore/DrsCalib.h

Download in other formats: