summaryrefslogtreecommitdiff
path: root/quad_reg.awk
blob: 1b30afd66e3d07c95345c6b3e0cf2c788a60ba48 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/awk -f

### quad_reg.awk
# quadratic least-squares regression (y = A + Bx + Cx^2) between every pair of columns

# Set the numeric output format and assemble `number`, the anchored regular
# expression the other rules use to decide whether a field is numeric.
BEGIN {
    OFMT="%.9g"
    sign = "[+-]?"
    # digits with an optional point and optional trailing digits: 1  1.  1.5
    decimal = "[0-9]+[.]?[0-9]*"
    # a leading point must be followed by at least one digit, so that a bare
    # "." (or "+." / ".e5") is not mistaken for a number
    fraction = "[.][0-9]+"
    # optional exponent part: e5  E-3
    exponent = "([Ee]" sign "[0-9]+)?"
    number = "^" sign "(" decimal "|" fraction ")" exponent "$"
}

# Name the columns from the first record: a field that looks like a number
# gets the generated name "col<index>", anything else is used verbatim as
# the column's header.
NR == 1 {
    n = 1
    while (n <= NF) {
        if ($n ~ number)
            header[n] = "col" n
        else
            header[n] = $n
        n++
    }
}

# Per-record accumulation (records with no fields are skipped).
# Pass 1 updates the running power sums, means and squared deviations of
# every numeric column.  Pass 2 then, for every ordered pair of numeric
# columns (x, y), refits the trendline y = A + Bx + Cx^2 from the moments
# accumulated so far and adds the squared residual of the current record to
# sum_err2[x,y].
NF > 0 {
    if (NF > nf_max)
        nf_max = NF

    ### pass 1: per-column moments
    # All columns are brought up to date before any pair is touched, so the
    # pair statistics below never mix moments that already include this
    # record with moments that do not.
    for (y=1; y<=nf_max; y++) {
        if ($y !~ number)
            continue

        ### mean
        count[y] += 1
        sum[y] += $y
        sum2[y] += $y*$y
        sum3[y] += $y*$y*$y
        sum4[y] += $y*$y*$y*$y
        mean[y] = sum[y]/count[y]
        mean2[y] = sum2[y]/count[y]

        ### difference from the (already updated) running mean
        delta[y] = $y - mean[y]
        sum_delta[y] += delta[y]
        sum_delta2[y] += delta[y]*delta[y]

        ### sample variance (left empty until there are two values)
        if (count[y] > 1)
            var[y] = sum_delta2[y]/(count[y] - 1)
        else
            var[y] = ""
    }

    ### pass 2: pairwise fit
    # x = row, y = col, trendline: y = A + Bx + Cx^2
    for (y=1; y<=nf_max; y++) {
        if ($y !~ number)
            continue

        for (x=1; x<=nf_max; x++) {
            # a pair only counts records where both fields are numeric; a
            # text or missing x would otherwise enter the sums as 0
            if ($x !~ number)
                continue

            count[x,y] += 1
            sum_xy[x,y] += $x*$y
            sum_x2y[x,y] += $x*$x*$y
            sum_delta_xy[x,y] += delta[x]*delta[y]

            # covariances, in the form E[uv] - E[u]E[v]
            if (count[x,y] > 1) {
                s_xx[x,y] = sum2[x]/(count[x,y]) - mean[x]*mean[x]
                s_xy[x,y] = sum_xy[x,y]/(count[x,y]) - mean[x]*mean[y]
                s_xx2[x,y] = sum3[x]/(count[x,y]) - mean[x]*mean2[x]
                s_x2x2[x,y] = sum4[x]/(count[x,y]) - mean2[x]*mean2[x]
                # sum_x2y is keyed by the pair, exactly like sum_xy
                s_x2y[x,y] = sum_x2y[x,y]/(count[x,y]) - mean2[x]*mean[y]
            }

            # common denominator of B and C; zero means the fit is
            # undetermined, in which case both coefficients fall back to 0
            bc_den[x,y] = (s_xx[x,y]*s_x2x2[x,y] - s_xx2[x,y]*s_xx2[x,y])
            if (bc_den[x,y]) {
                c[x,y] = (s_x2y[x,y]*s_xx[x,y] - s_xy[x,y]*s_xx2[x,y])/bc_den[x,y]
                b[x,y] = (s_xy[x,y]*s_x2x2[x,y] - s_x2y[x,y]*s_xx2[x,y])/bc_den[x,y]
            }
            else {
                c[x,y] = 0
                b[x,y] = 0
            }
            a[x,y] = mean[y] - b[x,y]*mean[x] - c[x,y]*mean[x]*mean[x]

            # error estimate: residual of this record against the
            # coefficients as they stand after this record
            err[x,y] = ($y - (a[x,y] + b[x,y]*$x + c[x,y]*$x*$x))
            sum_err2[x,y] += err[x,y]*err[x,y]

            # ratio of accumulated squared residuals to the accumulated
            # squared deviations of y; 1 while y has shown no spread
            if (sum_delta2[y])
                r2[x,y] = sum_err2[x,y]/sum_delta2[y]
            else
                r2[x,y] = 1
        }
    }
}

# Final report: one line per ordered pair of distinct columns whose r2 entry
# is non-zero.  Each line carries 10*log10(r2), the name of the dependent
# column, and the fitted coefficients C, B and A next to the name of the
# independent column.
END {
    # pieces are joined with OFS and the line is terminated with ORS;
    # every numeric value is formatted with OFMT
    fmt = OFMT OFS "(%s)" OFS " = (" OFMT ")(%s)^2" OFS " + (" OFMT ")(%s)" OFS " + (" OFMT ")" ORS
    ln10 = log(10)

    for (x=1; x<=nf_max; x++) {
        for (y=1; y<=nf_max; y++) {
            # a column is never reported against itself
            if (x == y)
                continue
            # unset or zero ratio: nothing to report (and log() of it
            # would not be finite)
            if (!r2[x,y])
                continue
            printf(fmt, 10.0*log(r2[x,y])/ln10, header[y], c[x,y], header[x], b[x,y], header[x], a[x,y])
        }
    }
}