Changes between Version 93 and Version 94 of DatabaseBasedAnalysis


Ignore:
Timestamp:
08/08/18 09:47:37 (6 years ago)
Author:
tbretz
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • DatabaseBasedAnalysis

    v93 v94  
    502502{{{#!sql
    503503SELECT
    504     Counter.*,
    505     `Signal` - `Background`/5      AS `Excess`,
    506     LiMa(`Signal`, `Background`/5) AS `Significance`
     504    *,
     505    `Signal` - `Background`      AS `Excess`,
     506    LiMa(`Signal`, `Background`) AS `Significance`
    507507FROM
    508508(
    509509
    510510    SELECT
    511         COUNT(IF(Weight>0, 1, NULL)) AS `Signal`,
    512         COUNT(IF(Weight<0, 1, NULL)) AS `Background`
     511        COUNT(IF(Weight>0, 1, NULL))   AS `Signal`,
     512        COUNT(IF(Weight<0, 1, NULL))/5 AS `Background`
    513513    FROM
    514514    (
     
    539539            @ThetaSq := (@Disp*@Disp + @Dist*@Dist - 2*@Disp*@Dist*SQRT(1-@LX*@LX)) AS ThetaSq
    540540        FROM RunInfo
    541         LEFT JOIN Events   USING (FileId)
    542         LEFT JOIN Position USING (FileId, EvtNumber)
    543         CROSS JOIN Wobble
    544         WHERE
    545             fSourceKey=5
    546         AND
    547             fRunTypeKey=1
    548         AND
     541        LEFT JOIN Events   USING (FileId)
     542        LEFT JOIN Position USING (FileId, EvtNumber)
     543        CROSS JOIN Wobble
     544        WHERE
     545
     546        /* Data Selection */
     547            fSourceKey=5
     548        AND
     549            fRunTypeKey=1
     550        AND
    549551            FileId BETWEEN 131101000 AND 131107000
    550552        AND
    551553            fZenithDistanceMax<35
    552554        AND
    553              fR750Cor>0.9*fR750Ref
    554     ) TableAlias
     555            fR750Cor>0.9*fR750Ref
     556        AND
     557
     558        /* Quality cuts */
     559            NumUsedPixels>5.5
     560        AND
     561            NumIslands<3.5
     562        AND
     563            Leakage1<0.1
     564
     565        /* Background suppression cut */
     566        HAVING
     567            Area < LOG10(Size)*898-1535
     568
     569    ) InnerTable
    555570
    556571    WHERE
     572        /* Theta-square cut */
    557573        ThetaSq<0.024
    558     AND
    559         Area < LOG10(Size)*898-1535
    560     AND
    561         NumUsedPixels>5.5
    562     AND
    563         NumIslands<3.5
    564     AND
    565         Leakage1<0.1
    566 
    567 ) Counter
     574
     575) OuterTable
    568576}}}
    569577
     
    573581| Signal | Background | Excess   | Significance      |
    574582+--------+------------+----------+-------------------+
    575 |    984 |       2205 | 543.0000 | 19.72239008502298 |
     583|    984 |   446.0000 | 538.0000 | 19.48657640549446 |
    576584+--------+------------+----------+-------------------+
    577 1 row in set (2 min 5.48 sec)
     5851 row in set (0 min 11.33 sec)
    578586}}}
    579587
     
    583591*************************** 1. row ***************************
    584592      Signal: 984
    585   Background: 2205
    586       Excess: 543.0000
    587 Significance: 19.72239008502298
    588 1 row in set (2 min 0.02 sec)
     593  Background: 446.0000
     594      Excess: 538.0000
     595Significance: 19.48657640549446
     5961 row in set (0 min 11.33 sec)
    589597}}}
    590598
    591599I am sure there is also a query which in addition prints the effective on-time.
     600
     601Note that for performance reasons it is (like in C++) very important that cuts are done as early as possible. Therefore, the quality-cuts are already done during event selection. Whether it is faster to have the area-cut in a `HAVING` clause or in the outer table in a `WHERE` clause might depend on the individual case (and how the optimizer handles it) and need to be tested. Note that a theta-square cut can not work in the HAVING clause because here, theta-square is only evaluated '''after''' the `HAVING` clause, i.e. just before the data is ''sent''.
    592602
    593603== WARNING: SQL User Variables ==
     
    610620SELECT -- 10
    611621    Counter.*,
    612     `Signal` - `Background`/5      AS `Excess`,
    613     LiMa(`Signal`, `Background`/5) AS `Significance`
     622    `Signal` - `Background`      AS `Excess`,
     623    LiMa(`Signal`, `Background`) AS `Significance`
    614624FROM
    615625( -- Counter
    616626
    617627    SELECT -- 9
    618         COUNT(IF(Weight>0, 1, NULL)) AS `Signal`,
    619         COUNT(IF(Weight<0, 1, NULL)) AS `Background`
     628        COUNT(IF(Weight>0, 1, NULL))   AS `Signal`,
     629        COUNT(IF(Weight<0, 1, NULL))/5 AS `Background`
    620630    FROM
    621631    ( -- Table8
     
    696706                                        CROSS JOIN Wobble
    697707                                        WHERE
     708
     709                                        /* Data selection */
    698710                                            fSourceKey=5
    699711                                        AND
     
    706718                                            fR750Cor>0.9*fR750Ref
    707719                                        AND
     720
     721                                        /* Quality cuts */
    708722                                            NumUsedPixels>5.5
    709723                                        AND
     
    717731
    718732                                    WHERE
     733
     734                                        /* Background suppression cut */
    719735                                        Area < LOG10(Size)*898-1535
    720736
     
    752768
    753769    WHERE
     770
     771        /* Theta-square cut */
    754772        ThetaSq<0.024
    755773
     
    761779}}}
    762780
    763 Now what makes the difference... first, execution time shows the power of MySQL. The old query leads:
    764 {{{
    765 Execution time: 122.145s
    766 }}}
    767 
    768781This new query results in
    769782{{{
    770 Execution time: 12.0002s
    771 }}}
     783Execution time: 17.3689s
     784}}}
     785
     786In this particular example, the `WHERE` clause for the background suppression cut turns out to be much (2x) faster than the `HAVING` clause, while in the previous example, it was the opposite.
    772787
    773788Note that this is the result of the analysis of a dataset which, as a whole, is more than 100GB and even reduced to Crab after quality cuts still is of the order of GB! As a comparison. The source code which does all the calculations in compiled C++ code takes about 4.5s on the same machine.
    774 
    775 Why is this query so much faster? I guess it has two reasons:
    776 
    777 1) Only variables which are really needed in the next step are kept (reduces memory consumption)
    778 2) Cuts are evaluated by the optimizer as soon as possible (e.g. the Area cuts might be evaluated even before the other values are loaded from disk)
    779 
    780 Another example (apart from the obvious difficulty to read the query) is that it is very well structured. For each step it is very well defined what values
    781 are available from the previous step (the fields which are requested in each SELECT statement) and it is very well defined in which order the algorithm is executed.
    782 
    783 If somebody find a way to write that query in an easier readable way, I would be glad to know.
    784789
    785790{{{#!div style="background: #d88; border: 3px ridge"
     
    801806So as a test, let's append a `e0` to all fractional values in the query, e.g. instead of `0.5`, let's write `0.5e0` (or `5e-1`). This essentially tells the SQL engine to store the value as an approximate  ''floating-point'' value and not as an ''exact'' value. Now, let's run the query again...
    802807
    803 And voila!
    804 
    805 {{{
    806 Execution time: 7.59051s
    807 }}}
    808 
    809 '' This is again much faster! '' (and now almost as fast as a pure C++ analysis assuming that the data is already available in a local file)
     808And voila! This is really faster:
     809
     810{{{
     811Execution time: 9.04737s
     812}}}
    810813
    811814{{{#!div style="background: #8d8; border: 3px ridge"
     
    867870                                            fZenithDistanceMax<35
    868871                                        AND
    869                                             fR750Cor>0.9*fR750Ref
     872                                            fR750Cor>0.9e0*fR750Ref
    870873                                        AND
    871874                                            NumUsedPixels>5.5
     
    879882                                    SELECT -- 1
    880883                                        Weight, CosDelta, SinDelta, M3Long, SlopeLong, Leakage1, WdivL,
    881                                         MeanX - PX/1.02 AS DX,
    882                                         MeanY - PY/1.02 AS DY
     884                                        MeanX - PX/1.02e0 AS DX,
     885                                        MeanY - PY/1.02e0 AS DY
    883886                                    FROM
    884887                                        Table0
    885888                                    WHERE
    886                                         Area < LOG10(Size)*898-1535
     889                                        Area < LOG10(Size)*898e0-1535e0
    887890
    888891                                ) -- AS Table1
     
    916919                    SELECT -- 5
    917920                        Weight, Leakage1, WdivL, LX,
    918                         Norm          *0.0117193246260285378 AS Dist,
    919                         M3Long   *Sign*0.0117193246260285378 AS M3L,
    920                         SlopeLong*Sign/0.0117193246260285378 AS Slope
     921                        Norm          *0.0117193246260285378e0 AS Dist,
     922                        M3Long   *Sign*0.0117193246260285378e0 AS M3L,
     923                        SlopeLong*Sign/0.0117193246260285378e0 AS Slope
    921924                    FROM
    922925                        Table4
     
    926929                SELECT -- 6
    927930                    Weight, WdivL, Dist, LX,
    928                     1.39252 + 0.154247*Slope + 1.67972*(1-1/(1+4.86232*Leakage1)) AS Xi,
    929                     M3L+0.07             AS Sign1,
    930                     (Dist-0.5)*7.2-Slope AS Sign2
     931                    1.39252e0 + 0.154247e0*Slope + 1.67972e0*(1-1/(1+4.86232e0*Leakage1)) AS Xi,
     932                    M3L+0.07e0             AS Sign1,
     933                    (Dist-0.5e0)*7.2e0-Slope AS Sign2
    931934                FROM
    932935                    Table5
     
    951954
    952955    SELECT -- 9
    953         COUNT(IF(Weight>0, 1, NULL)) AS `Signal`,
    954         COUNT(IF(Weight<0, 1, NULL)) AS `Background`
     956        COUNT(IF(Weight>0, 1, NULL))   AS `Signal`,
     957        COUNT(IF(Weight<0, 1, NULL))/5 AS `Background`
    955958    FROM
    956959        Table8
     
    961964SELECT -- 10
    962965    *,
    963     `Signal` - `Background`/5      AS `Excess`,
    964     LiMa(`Signal`, `Background`/5) AS `Significance`
     966    `Signal` - `Background`      AS `Excess`,
     967    LiMa(`Signal`, `Background`) AS `Significance`
    965968FROM
    966969    Table9
    967970}}}
    968971
    969 == MySQL 8 ==
    970 
    971 Note that after installing MySQL the performance generally got a bit better so that the difference between the queries became less pronounced. Unfortunately, the performance of the fastes query got slightly worse.
     972Here with floating-point numbers. The execution time is similar.
     973{{{
     974Execution time: 9.09697s
     975}}}
     976
     977
     978== Optimizing the query further ==
     979
     980By throwing out unnecessary calculations, this query can be optimized further
     981
     982{{{#!sql
     983WITH Table9 AS
     984(
     985    WITH Table8 AS
     986    (
     987        WITH Table7 AS
     988        (
     989            WITH Table6 AS
     990            (
     991                WITH Table5 AS
     992                (
     993                    WITH Table3 AS
     994                    (
     995                        WITH Table2 AS
     996                        (
     997                            WITH Table1 AS
     998                            (
     999                                WITH Table0 AS
     1000                                (
     1001                                    SELECT -- 0
     1002                                        Weight,
     1003                                        Size,
     1004                                        NumUsedPixels,
     1005                                        NumIslands,
     1006                                        Leakage1,
     1007                                        MeanX,
     1008                                        MeanY,
     1009                                        CosDelta,
     1010                                        SinDelta,
     1011                                        M3Long,
     1012                                        SlopeLong,
     1013                                        Width/Length      AS WdivL,
     1014                                        PI()*Width*Length AS Area,
     1015                                        cosa*X - sina*Y   AS PX,
     1016                                        cosa*Y + sina*X   AS PY
     1017                                    FROM RunInfo
     1018                                    LEFT JOIN Events   USING (FileId)
     1019                                    LEFT JOIN Position USING (FileId, EvtNumber)
     1020                                    CROSS JOIN Wobble
     1021                                    WHERE
     1022                                        fSourceKey=5
     1023                                    AND
     1024                                        fRunTypeKey=1
     1025                                    AND
     1026                                        FileId BETWEEN 131101000 AND 131107000
     1027                                    AND
     1028                                        fZenithDistanceMax<35
     1029                                    AND
     1030                                        fR750Cor>0.9e0*fR750Ref
     1031                                    AND
     1032                                        NumUsedPixels>5.5
     1033                                    AND
     1034                                        NumIslands<3.5
     1035                                    AND
     1036                                        Leakage1<0.1
     1037
     1038                                ) -- AS Table0
     1039
     1040                                SELECT -- 1
     1041                                    Weight, CosDelta, SinDelta, M3Long, SlopeLong, Leakage1, WdivL,
     1042                                    MeanX - PX/1.02e0 AS DX,
     1043                                    MeanY - PY/1.02e0 AS DY
     1044                                FROM
     1045                                    Table0
     1046                                WHERE
     1047                                    Area < LOG10(Size)*898e0-1535e0
     1048
     1049                            ) -- AS Table1
     1050
     1051                            SELECT -- 2
     1052                                Weight, CosDelta, SinDelta, DX, DY, M3Long, SlopeLong, Leakage1, WdivL,
     1053                                SQRT(DX*DX + DY*DY) AS Norm
     1054                            FROM
     1055                                Table1
     1056
     1057                        ) -- AS Table2
     1058
     1059                        SELECT -- 3
     1060                            Weight, M3Long, SlopeLong, Leakage1, WdivL, Norm,
     1061                            LEAST(GREATEST((CosDelta*DY - SinDelta*DX)/Norm, -1), 1) AS LX,
     1062                            SIGN(CosDelta*DX + SinDelta*DY) AS Sign
     1063                        FROM
     1064                            Table2
     1065
     1066                    ) -- AS Table3
     1067
     1068                    SELECT -- 5
     1069                        Weight, Leakage1, WdivL, LX,
     1070                        Norm          *0.0117193246260285378e0 AS Dist,
     1071                        M3Long   *Sign*0.0117193246260285378e0 AS M3L,
     1072                        SlopeLong*Sign/0.0117193246260285378e0 AS Slope
     1073                    FROM
     1074                        Table3
     1075
     1076                ) -- AS Table5
     1077
     1078                SELECT -- 6
     1079                    Weight, WdivL, Dist, LX, M3L, Slope,
     1080                    1.39252e0 + 0.154247e0*Slope + 1.67972e0*(1-1/(1+4.86232e0*Leakage1)) AS Xi
     1081                FROM
     1082                    Table5
     1083
     1084            ) -- AS Table6
     1085
     1086            SELECT -- 7
     1087                Weight, Dist, LX,
     1088                IF (M3L<-0.07 || (Dist-0.5e0)*7.2e0-Slope<0, -Xi, Xi) * (1-WdivL) AS Disp
     1089            FROM
     1090                Table6
     1091
     1092        ) -- AS Table7
     1093
     1094        SELECT -- 8
     1095            Weight,
     1096            (Disp*Disp + Dist*Dist - 2*Disp*Dist*SQRT(1-LX*LX)) AS ThetaSq
     1097        FROM
     1098            Table7
     1099
     1100    ) -- AS Table8
     1101
     1102    SELECT -- 9
     1103        COUNT(IF(Weight>0, 1, NULL))   AS `Signal`,
     1104        COUNT(IF(Weight<0, 1, NULL))/5 AS `Background`
     1105    FROM
     1106        Table8
     1107    WHERE
     1108        ThetaSq<0.024
     1109) -- AS Table9
     1110
     1111SELECT -- 10
     1112    *,
     1113    `Signal` - `Background`      AS `Excess`,
     1114    LiMa(`Signal`, `Background`) AS `Significance`
     1115FROM
     1116    Table9
     1117}}}
     1118
     1119With
     1120
     1121{{{
     1122Execution time: 6.75859s
     1123}}}
     1124
     1125Almost as fast as the 4.5s with C++ where the C++ compiler optimizes the maths. The difference is most likely overhead for example from the SQL optimizer.