File Coverage

File:LikeR.xs
Coverage:77.4%

linestmtbrancondsubtimecode
1/* --- C HELPER SECTION --- */
2#define PERL_NO_GET_CONTEXT
3#include "EXTERN.h"
4#include "perl.h"
5#include "XSUB.h"
6#include "ppport.h"
7#include <math.h>
8#include <ctype.h>
9#include <stdlib.h>
10#include <float.h>
11#include <string.h>
12/* Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() */
13#define AUTO_SEED_PRNG() \
14    do { \
15        if (!PL_srand_called) { \
16            (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \
17            PL_srand_called = TRUE; \
18        } \
19    } while (0)
20
21// ---------------------------------------
22//   Helpers for Random Number Generation
23// ---------------------------------------
24#ifndef M_PI
25#define M_PI 3.14159265358979323846
26#endif
27// C helper for EXACT Non-central T-distribution CDF via Numerical Integration.
28// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions.
29
687
static double exact_pnt(double t, double df, double ncp) {
30
687
        if (df <= 0.0) return 0.0;
31
32
687
        unsigned short int n_steps = 30000;
33
687
        double step = 1.0 / n_steps;
34
687
        double integral = 0.0;
35
687
        double half_df = df / 2.0;
36
37
687
        double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df);
38
687
        double root_half = 0.70710678118654752440; // 1 / sqrt(2)
39
40
20610000
        for (unsigned short i = 1; i < n_steps; i++) {
41
20609313
                double u = i * step;
42
20609313
                double w = u / (1.0 - u);
43
44                // Scaled Chi-distribution log-density
45
20609313
                double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w;
46
20609313
                double M = exp(log_M);
47
48                // Exact Normal CDF using the C standard library's erfc function
49
20609313
                double z = t * w - ncp;
50
20609313
                double pnorm_val = 0.5 * erfc(-z * root_half);
51
52
20609313
                double weight = (i % 2 != 0) ? 4.0 : 2.0;
53
20609313
                integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u)));
54        }
55
56
687
        return integral * (step / 3.0);
57}
58// --- Math Helpers for P-values and Confidence Intervals ---
59
60// Ranking helper with tie adjustment (matches R's tie handling)
61typedef struct { double val; size_t idx; double rank; } RankInfo;
62
33
static int compare_rank(const void *restrict a, const void *restrict b) {
63
33
        double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val;
64
33
        return (diff > 0) - (diff < 0);
65}
66
67
33
static int compare_index(const void *restrict a, const void *restrict b) {
68
33
        return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx;
69}
70
71
6
static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) {
72
6
        RankInfo *restrict items = safemalloc(n * sizeof(RankInfo));
73
36
        for (size_t i = 0; i < n; i++) {
74
30
                items[i].val = data[i];
75
30
                items[i].idx = i;
76        }
77
6
        qsort(items, n, sizeof(RankInfo), compare_rank);
78        // Handle ties by averaging ranks
79
36
        for (size_t i = 0; i < n; ) {
80
30
                size_t j = i + 1;
81
30
                while (j < n && items[j].val == items[i].val) j++;
82
30
                double avg_rank = (i + 1 + j) / 2.0;
83
60
                for (size_t k = i; k < j; k++) items[k].rank = avg_rank;
84
30
                i = j;
85        }
86
6
        qsort(items, n, sizeof(RankInfo), compare_index);
87
36
        for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank;
88
6
        Safefree(items);
89
6
}
90// Generates a single binomial random variate.
91//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG.
92
61497
static size_t generate_binomial(const size_t size, const double prob) {
93
61497
        if (prob <= 0.0) return 0;
94
61197
        if (prob >= 1.0) return size;
95
96
60897
        size_t successes = 0;
97
936870
        for (size_t i = 0; i < size; i++) {
98
875973
                if (Drand01() <= prob) successes++;
99        }
100
60897
        return successes;
101}
102// Helper: log combination
103
426
static double log_choose(size_t n, size_t k) {
104
426
        return lgamma((double)n + 1.0) - lgamma((double)k + 1.0) - lgamma((double)(n - k) + 1.0);
105}
106
107// Log-space tails for non-central hypergeometric
108
1212
static void calc_tails_logspace(size_t a, size_t min_x, size_t max_x, double omega, const double *logdc, double *restrict lower_tail, double *restrict upper_tail) {
109
1212
        double max_d = -1e300, log_omega = log(omega);
110
111
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
112
8844
          double d_val = logdc[k] + log_omega * (min_x + k);
113
8844
          if (d_val > max_d) max_d = d_val;
114        }
115
116
1212
        double sum_d = 0.0;
117
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
118
8844
          sum_d += exp(logdc[k] + log_omega * (min_x + k) - max_d);
119        }
120
121
1212
        *lower_tail = 0.0;
122
1212
        *upper_tail = 0.0;
123
124
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
125
8844
          double p_prob = exp(logdc[k] + log_omega * (min_x + k) - max_d) / sum_d;
126
8844
          if (min_x + k <= a) *lower_tail += p_prob;
127
8844
          if (min_x + k >= a) *upper_tail += p_prob;
128        }
129
1212
}
130
131// Exact stats using log-space
132
15
static void calculate_exact_stats(size_t a, size_t b, size_t c, size_t d, double conf_level, const char*restrict alt, double *restrict mle_or, double *restrict ci_low, double *restrict ci_high) {
133
15
    double alpha = 1.0 - conf_level;
134
15
    size_t r1 = a + b, r2 = c + d, c1 = a + c;
135
15
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
136
15
    size_t max_x = (r1 < c1) ? r1 : c1;
137
138
15
    bool is_less = (strcmp(alt, "less") == 0);
139
15
    bool is_greater = (strcmp(alt, "greater") == 0);
140
141
15
    double *restrict logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
142
15
    double denom = log_choose(r1 + r2, c1);
143
114
    for(size_t x = min_x; x <= max_x; ++x) {
144
99
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
145    }
146
147    // MLE
148
15
    if (a == min_x && a == max_x) *mle_or = 1.0;
149
15
    else if (a == min_x) *mle_or = 0.0;
150
15
    else if (a == max_x) *mle_or = INFINITY;
151    else {
152
12
        double log_low = -100.0, log_high = 100.0;
153
696
        for (unsigned short int i = 0; i < 3000; i++) {
154
696
            double log_mid = 0.5 * (log_low + log_high);
155
696
            double max_d = -1e300;
156
5568
            for(size_t k = 0; k <= max_x - min_x; ++k) {
157
4872
                double d_val = logdc[k] + log_mid * (min_x + k);
158
4872
                if (d_val > max_d) max_d = d_val;
159            }
160
696
            double sum_d = 0.0, exp_val = 0.0;
161
5568
            for(size_t k = 0; k <= max_x - min_x; ++k) {
162
4872
                double p_prob = exp(logdc[k] + log_mid * (min_x + k) - max_d);
163
4872
                sum_d += p_prob;
164
4872
                exp_val += (min_x + k) * p_prob;
165            }
166
696
            exp_val /= sum_d;
167
168
696
            if (exp_val > a) log_high = log_mid;
169
276
            else log_low = log_mid;
170
696
            if (log_high - log_low < 1e-15) break;
171        }
172
12
        *mle_or = exp(0.5 * (log_low + log_high));
173    }
174
175
15
    *ci_low = 0.0;
176
15
    *ci_high = INFINITY;
177
178    // Lower CI
179
15
    if (!is_less) {
180
12
        double target_alpha = is_greater ? alpha : alpha / 2.0;
181
12
        if (a != min_x) {
182
12
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
183
696
            for (unsigned short int i = 0; i < 1000; i++) {
184
696
                double log_mid = 0.5 * (log_low + log_high);
185
696
                double mid = exp(log_mid);
186
696
                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
187
696
                double err = fabs(ut - target_alpha);
188
696
                if (err < best_err) { best_err = err; best = mid; }
189
696
                if (ut > target_alpha) log_high = log_mid;
190
318
                else log_low = log_mid;
191
696
                if (log_high - log_low < 1e-15) break;
192            }
193
12
            *ci_low = best;
194        }
195    }
196
197    // Upper CI
198
15
    if (!is_greater) {
199
12
        double target_alpha = is_less ? alpha : alpha / 2.0;
200
12
        if (a != max_x) {
201
9
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
202
516
            for (unsigned short int i = 0; i < 1000; i++) {
203
516
                double log_mid = 0.5 * (log_low + log_high);
204
516
                double mid = exp(log_mid);
205
516
                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
206
516
                double err = fabs(lt - target_alpha);
207
516
                if (err < best_err) { best_err = err; best = mid; }
208
516
                if (lt > target_alpha) log_low = log_mid;
209
264
                else log_high = log_mid;
210
516
                if (log_high - log_low < 1e-15) break;
211            }
212
9
            *ci_high = best;
213        }
214    }
215
15
    safefree(logdc);
216
15
}
217
218// Exact p-value using log-space
219
15
static double exact_p_value(size_t a, size_t b, size_t c, size_t d, const char* alt) {
220
15
    size_t r1 = a + b, r2 = c + d, c1 = a + c;
221
15
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
222
15
    size_t max_x = (r1 < c1) ? r1 : c1;
223
224
15
    double *logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
225
15
    double denom = log_choose(r1 + r2, c1);
226
114
    for(size_t x = min_x; x <= max_x; ++x) {
227
99
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
228    }
229
230
15
    double p_val = 0.0;
231
232
15
    if (strcmp(alt, "less") == 0) {
233
15
        for(size_t x = min_x; x <= a; ++x) p_val += exp(logdc[x - min_x]);
234
12
    } else if (strcmp(alt, "greater") == 0) {
235
9
        for(size_t x = a; x <= max_x; ++x) p_val += exp(logdc[x - min_x]);
236    } else {
237
9
        double p_obs = exp(logdc[a - min_x]);
238
9
        double relErr = 1.0 + 1e-7;
239
78
        for(size_t x = min_x; x <= max_x; ++x) {
240
69
            double p_cur = exp(logdc[x - min_x]);
241
69
            if (p_cur <= p_obs * relErr) p_val += p_cur;
242        }
243    }
244
245
15
    safefree(logdc);
246
15
    return (p_val > 1.0) ? 1.0 : p_val;
247}
248/* -----------------------------------------------------------------------
249 * Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing
250 * ----------------------------------------------------------------------- */
251
252/* Sweep operator for symmetric positive-definite matrices (e.g., XtX).
253 * This gracefully handles collinearity by bypassing aliased columns.
254 * Utilizes a relative tolerance check to prevent dropping micro-variance features.
255 */
256
132
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) {
257
132
        int rank = 0;
258
132
        double *restrict orig_diag = (double*)safemalloc(n * sizeof(double));
259
260        // Save the original diagonal values to use as a baseline for relative variance
261
510
        for (size_t k = 0; k < n; k++) {
262
378
                aliased[k] = false;
263
378
                orig_diag[k] = A[k * n + k];
264        }
265
266
510
        for (size_t k = 0; k < n; k++) {
267                // Check pivot for collinearity using a RELATIVE tolerance
268                // (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors)
269
378
                if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) {
270
3
                        aliased[k] = true;
271                        // Isolate this column so it doesn't affect the rest of the matrix
272
12
                        for (size_t i = 0; i < n; i++) {
273
9
                                A[k * n + i] = 0.0;
274
9
                                A[i * n + k] = 0.0;
275                        }
276
3
                        continue;
277                }
278
375
                rank++;
279
375
                double pivot = 1.0 / A[k * n + k];
280
375
                A[k * n + k] = 1.0;
281
1476
                for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot;
282
1476
                for (size_t i = 0; i < n; i++) {
283
1101
                        if (i != k && A[i * n + k] != 0.0) {
284
708
                                  double factor = A[i * n + k];
285
708
                                  A[i * n + k] = 0.0;
286
2838
                                  for (size_t j = 0; j < n; j++) {
287
2130
                                       A[i * n + j] -= factor * A[k * n + j];
288                                  }
289                        }
290                }
291        }
292
132
        Safefree(orig_diag);
293
132
        return rank;
294}
295
296// Internal extractor resolving single data values. Returns NAN on missing or non-numeric.
297
4872
static double get_data_value(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) {
298
4872
    SV **restrict val = NULL;
299
4872
    if (row_hashes) {
300
3552
        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
301
3552
        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
302
3552
            AV*restrict av = (AV*)SvRV(*val);
303
3552
            val = av_fetch(av, 0, 0);
304        }
305
1320
    } else if (data_hoa) {
306
1320
        SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
307
1320
        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
308
1320
            AV*restrict av = (AV*)SvRV(*col);
309
1320
            val = av_fetch(av, i, 0);
310        }
311    }
312
4872
    if (val && SvOK(*val)) {
313
4863
        if (looks_like_number(*val)) return SvNV(*val);
314
0
        return NAN; // Catch strings like "blue"
315    }
316
9
    return NAN; // Catch undef/missing keys
317}
318
319// Helper: Get all available columns for the '.' operator expansion
320
6
static AV* get_all_columns(HV *restrict data_hoa, HV **restrict row_hashes, size_t n) {
321
6
    AV *cols = newAV();
322
6
    if (data_hoa) {
323
6
        hv_iterinit(data_hoa);
324        HE *entry;
325
24
        while ((entry = hv_iternext(data_hoa))) {
326
18
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
327        }
328
0
    } else if (row_hashes && n > 0 && row_hashes[0]) {
329
0
        hv_iterinit(row_hashes[0]);
330        HE *entry;
331
0
        while ((entry = hv_iternext(row_hashes[0]))) {
332
0
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
333        }
334    }
335
6
    return cols;
336}
337
338// Recursive formula resolver with tightened NaN and Null handling
339
4968
static double evaluate_term(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) {
340
4968
    if (!term || term[0] == '\0') return NAN;
341
342
4968
    char *restrict term_cpy = savepv(term);
343
4968
    char *restrict colon = strchr(term_cpy, ':');
344
4968
    if (colon) {
345
96
        *colon = '\0';
346
96
        double left = evaluate_term(data_hoa, row_hashes, i, term_cpy);
347
96
        double right = evaluate_term(data_hoa, row_hashes, i, colon + 1);
348
96
        Safefree(term_cpy);
349
350
96
        if (isnan(left) || isnan(right)) return NAN;
351
96
        return left * right;
352    }
353
4872
    if (strncmp(term_cpy, "I(", 2) == 0) {
354
0
        char *restrict end = strrchr(term_cpy, ')');
355
0
        if (end) *end = '\0';
356
0
        char *restrict inner = term_cpy + 2;
357
0
        char *restrict caret = strchr(inner, '^');
358
0
        int power = 1;
359
0
        if (caret) {
360
0
            *caret = '\0';
361
0
            power = atoi(caret + 1);
362        }
363
0
        double v = get_data_value(data_hoa, row_hashes, i, inner);
364
0
        Safefree(term_cpy);
365
366
0
        if (isnan(v)) return NAN;
367
0
        return power == 1 ? v : pow(v, power);
368    }
369
4872
    double result = get_data_value(data_hoa, row_hashes, i, term_cpy);
370
4872
    Safefree(term_cpy);
371
4872
    return result;
372}
373
374// Helper to infer column type from its first valid element
375
162
static bool is_column_categorical(HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) {
376
258
        for (size_t i = 0; i < n; i++) {
377
255
                SV **restrict val = NULL;
378
255
                if (row_hashes) {
379
165
                        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
380
165
                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
381
69
                                 AV*restrict av = (AV*)SvRV(*val);
382
69
                                 val = av_fetch(av, 0, 0);
383                        }
384
90
                } else if (data_hoa) {
385
90
                        SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
386
90
                        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
387
90
                                 AV*restrict av = (AV*)SvRV(*col);
388
90
                                 val = av_fetch(av, i, 0);
389                        }
390                }
391
255
                if (val && SvOK(*val)) {
392
159
                        if (looks_like_number(*val)) return false; // First valid is number -> Numeric Column
393
27
                        return true; // First valid is string -> Categorical Column
394                }
395        }
396
3
        return false;
397}
398
399/* Internal extractor resolving single data string values using dynamic allocation. */
400
1041
static char* get_data_string_alloc(HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) {
401
1041
        SV **restrict val = NULL;
402
1041
        if (row_hashes) {
403
0
                val = hv_fetch(row_hashes[i], var, strlen(var), 0);
404
0
                if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
405
0
                        AV*restrict av = (AV*)SvRV(*val);
406
0
                        val = av_fetch(av, 0, 0);
407                }
408
1041
        } else if (data_hoa) {
409
1041
                SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
410
1041
                if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
411
1041
                        AV*restrict av = (AV*)SvRV(*col);
412
1041
                        val = av_fetch(av, i, 0);
413                }
414        }
415
1041
        if (val && SvOK(*val)) {
416
1041
          return savepv(SvPV_nolen(*val)); /* Allocates and returns string */
417        }
418
0
        return NULL;
419}
420
421// Struct for sorting p-values while remembering their original index
422typedef struct {
423        double p;
424        size_t orig_idx;
425} PVal;
426
427// Comparator for qsort
428
4557
static int cmp_pval(const void *restrict a, const void *restrict b) {
429
4557
        double diff = ((PVal*)a)->p - ((PVal*)b)->p;
430
4557
        if (diff < 0) return -1;
431
2436
        if (diff > 0) return 1;
432        /* Stabilize sort by falling back to original index */
433
0
        return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx;
434}
435/* -----------------------------------------------------------------------
436 * Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b
437 * ----------------------------------------------------------------------- */
438/* Item used to sort values while remembering their original index,
439 * needed for average-rank tie-breaking in Spearman correlation.        */
440typedef struct {
441        double val;
442        size_t idx;
443} RankItem;
444
445
111
static int cmp_rank_item(const void *restrict a, const void *restrict b) {
446
111
        double diff = ((RankItem*)a)->val - ((RankItem*)b)->val;
447
111
        if (diff < 0) return -1;
448
12
        if (diff > 0) return  1;
449
3
        return 0;
450}
451
452/* Compute 1-based average ranks with tie-breaking into out[].
453 * in[] is not modified.                                                 */
454
12
static void rank_data(const double *restrict in, double *restrict out, size_t n) {
455        RankItem *restrict ri;
456
12
        Newx(ri, n, RankItem);
457
96
        for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; }
458
12
        qsort(ri, n, sizeof(RankItem), cmp_rank_item);
459
460
12
        size_t i = 0;
461
93
        while (i < n) {
462
81
                size_t j = i;
463                /* Find the full extent of this tie group */
464
84
                while (j + 1 < n && ri[j + 1].val == ri[j].val) j++;
465                /* All members get the average of ranks i+1 … j+1 (1-based) */
466
81
                double avg = (double)(i + j) / 2.0 + 1.0;
467
165
                for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg;
468
81
                i = j + 1;
469        }
470
12
        Safefree(ri);
471
12
}
472
473/* Pearson product-moment r between two n-element arrays.
474 * Returns NAN when either variable has zero variance (matches R).       */
475
18
static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) {
476
18
        double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0;
477
108
        for (size_t i = 0; i < n; i++) {
478
90
          sx  += x[i];     sy  += y[i];
479
90
          sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i];
480        }
481
18
        double num = (double)n * sxy - sx * sy;
482
18
        double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy));
483
18
        if (den == 0.0) return NAN;
484
18
        return num / den;
485}
486
487/* Kendall's tau-b between two n-element arrays.
488 *
489 *   tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y))
490 *
491 * where C = concordant pairs, D = discordant, T_x = pairs tied only on
492 * x, T_y = pairs tied only on y.  Joint ties (both zero) are excluded
493 * from numerator and denominator, matching R's cor(method="kendall").
494 * Returns NAN when the denominator is zero.                             */
495
3
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) {
496
3
        size_t C = 0, D = 0, tie_x = 0, tie_y = 0;
497
27
        for (size_t i = 0; i < n - 1; i++) {
498
132
          for (size_t j = i + 1; j < n; j++) {
499
108
                   int sx = (x[i] > x[j]) - (x[i] < x[j]);   /* sign of x[i]-x[j] */
500
108
                   int sy = (y[i] > y[j]) - (y[i] < y[j]);
501
108
                   if      (sx == 0 && sy == 0) { /* joint tie — not counted */ }
502
108
                   else if (sx == 0)            tie_x++;
503
105
                   else if (sy == 0)            tie_y++;
504
105
                   else if (sx == sy)           C++;
505
0
                   else                         D++;
506          }
507        }
508
3
        double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y));
509
3
        if (denom == 0.0) return NAN;
510
3
        return (double)(C - D) / denom;
511}
512
513/* Single dispatch: compute correlation according to method string.
514 * Allocates and frees temporary rank arrays internally for Spearman.   */
515
21
static double compute_cor(const double *restrict x, const double *restrict y,
516                           size_t n, const char *restrict method) {
517
21
        if (strcmp(method, "spearman") == 0) {
518          double *restrict rx, *restrict ry;
519
3
          Newx(rx, n, double); Newx(ry, n, double);
520
3
          rank_data(x, rx, n);
521
3
          rank_data(y, ry, n);
522
3
          double r = pearson_corr(rx, ry, n);
523
3
          Safefree(rx); Safefree(ry);
524
3
          return r;
525        }
526
18
        if (strcmp(method, "kendall") == 0)
527
3
          return kendall_tau_b(x, y, n);
528        /* default: pearson */
529
15
        return pearson_corr(x, y, n);
530}
531
532// Math macros
533#define MAX_ITER 500
534#define EPS 3.0e-15
535#define FPMIN 1.0e-30
536
537
24459
static double _incbeta_cf(double a, double b, double x) {
538        int m;
539        double aa, c, d, del, h, qab, qam, qap;
540
24459
        qab = a + b; qap = a + 1.0; qam = a - 1.0;
541
24459
        c = 1.0; d = 1.0 - qab * x / qap;
542
24459
        if (fabs(d) < FPMIN) d = FPMIN;
543
24459
        d = 1.0 / d; h = d;
544
540939
        for (m = 1; m <= MAX_ITER; m++) {
545
540939
          int m2 = 2 * m;
546
540939
          aa = m * (b - m) * x / ((qam + m2) * (a + m2));
547
540939
          d = 1.0 + aa * d;
548
540939
          if (fabs(d) < FPMIN) d = FPMIN;
549
540939
          c = 1.0 + aa / c;
550
540939
          if (fabs(c) < FPMIN) c = FPMIN;
551
540939
          d = 1.0 / d; h *= d * c;
552
540939
          aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
553
540939
          d = 1.0 + aa * d;
554
540939
          if (fabs(d) < FPMIN) d = FPMIN;
555
540939
          c = 1.0 + aa / c;
556
540939
          if (fabs(c) < FPMIN) c = FPMIN;
557
540939
          d = 1.0 / d; del = d * c; h *= del;
558
540939
          if (fabs(del - 1.0) < EPS) break;
559        }
560
24459
        return h;
561}
562
563
24582
static double incbeta(double a, double b, double x) {
564
24582
        if (x <= 0.0) return 0.0;
565
24582
        if (x >= 1.0) return 1.0;
566
24459
        double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x));
567
24459
        if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a;
568
4284
        return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b;
569}
570
571
24504
static double get_t_pvalue(double t, double df, const char*restrict alt) {
572
24504
        double x = df / (df + t * t);
573
24504
        double prob_2tail = incbeta(df / 2.0, 0.5, x);
574
24504
        if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
575
24498
        if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
576
312
        return prob_2tail;
577}
578
579// Bisection algorithm to find the inverse t-distribution (Critical t-value)
580
813
static double qt_tail(double df, double p_tail) {
581
813
        double low = 0.0, high = 1.0;
582        // Find upper bound
583
1929
        while (get_t_pvalue(high, df, "greater") > p_tail) {
584
1116
          low = high;
585
1116
          high *= 2.0;
586
1116
          if (high > 1000000.0) break; /* Fallback limit */
587        }
588        // Bisect to find the root
589
22254
        for (unsigned short int i = 0; i < 100; i++) {
590
22254
          double mid = (low + high) / 2.0;
591
22254
          double p_mid = get_t_pvalue(mid, df, "greater");
592
22254
          if (p_mid > p_tail) {
593
10869
                   low = mid;
594          } else {
595
11385
                   high = mid;
596          }
597
22254
          if (high - low < 1e-8) break;
598        }
599
813
        return (low + high) / 2.0;
600}
601
602
8505
int compare_doubles(const void *restrict a, const void *restrict b) {
603
8505
        double da = *(const double*restrict)a;
604
8505
        double db = *(const double*restrict)b;
605
8505
        return (da > db) - (da < db);
606}
607/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */
608
0
static size_t calculate_sturges_bins(size_t n) {
609
0
        if (n == 0) return 1;
610
0
        return (size_t)(log((double)n) / log(2.0) + 1.0);
611}
612
613// Logic for distributing data into bins (Optimized to O(N))
614
15
static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins,
615                               size_t *restrict counts, double *restrict mids, double *restrict density) {
616
15
        double total_n = (double)n;
617
15
        double min_val = breaks[0];
618
15
        double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0;
619        // Initialize counts and compute midpoints
620
69
        for (size_t i = 0; i < n_bins; i++) {
621
54
          counts[i] = 0;
622
54
          mids[i] = (breaks[i] + breaks[i+1]) / 2.0;
623        }
624        // Single O(N) pass to assign elements to bins
625
15
        if (step > 0.0) {
626
6051
          for (size_t j = 0; j < n; j++) {
627
6042
                   double val = x[j];
628                   // Ignore out-of-bounds or invalid values
629
6042
                   if (isnan(val) || isinf(val) || val < min_val) continue;
630                   // Calculate initial bin index mathematically
631
6042
                   size_t idx = (size_t)((val - min_val) / step);
632                   // Clamp to valid array bounds first to prevent overflow */
633
6042
                   if (idx >= n_bins) {
634
9
                       idx = n_bins - 1;
635                   }
636                   /* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */
637                   /* If value is exactly on or slightly below the lower boundary of the assigned bin,
638                      it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */
639
6069
                   while (idx > 0 && val <= breaks[idx]) {
640
27
                       idx--;
641                   }
642                   // Conversely, if floating-point truncation placed it too low, push it up
643
6042
                   while (idx < n_bins - 1 && val > breaks[idx + 1]) {
644
0
                       idx++;
645                   }
646
6042
                   counts[idx]++;
647          }
648
6
        } else if (n_bins > 0) {
649          // Edge case: All data points have the exact same value (step == 0)
650
6
          counts[0] = n;
651        }
652        // Compute densities
653
69
        for (size_t i = 0; i < n_bins; i++) {
654
54
          double bin_width = breaks[i+1] - breaks[i];
655
54
          if (bin_width > 0) {
656
48
                   density[i] = (double)counts[i] / (total_n * bin_width);
657          } else {
658
6
                   density[i] = (n_bins == 1) ? 1.0 : 0.0;
659          }
660        }
661
15
}
662
663// Standard Normal CDF approximation
664
168
double approx_pnorm(double x) {
665
168
        return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2)
666}
667#ifndef M_SQRT1_2
668#define M_SQRT1_2 0.70710678118654752440
669#endif
670
671/* Macro for exact Wilcoxon 3D array indexing */
672#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k))
673
84
static double inverse_normal_cdf(double p) {
674
84
        double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637};
675
84
        double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833};
676
84
        double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
677                          0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
678                          0.0000321767881768, 0.0000002888167364, 0.0000003960315187};
679        double x, r, y;
680
84
        y = p - 0.5;
681
84
        if (fabs(y) < 0.42) {
682
66
          r = y * y;
683
66
          x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) /
684
66
                       ((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0);
685        } else {
686
18
          r = p;
687
18
          if (y > 0) r = 1.0 - p;
688
18
          r = log(-log(r));
689
18
          x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] +
690
18
                   r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))));
691
18
          if (y < 0) x = -x;
692        }
693
84
        return x;
694}
695/* -----------------------------------------------------------------------
696 * Exact Spearman p-value via exhaustive permutation enumeration.
697 *
698 * Under H0, all n! orderings of ranks are equally probable.  We visit
699 * every permutation of {1..n} with Heap's algorithm (O(n!), no allocs
700 * inside the loop) and count how many yield S ≤ s_obs ("lower tail",
701 * i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail").
702 *
703 * Mirrors R's default: exact = (n < 10) with no ties.
704 * Valid up to n = 9 (362 880 iterations — negligible cost).
705 * ----------------------------------------------------------------------- */
706
3
static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) {
707
3
        int *restrict perm = (int*)safemalloc(n * sizeof(int));
708
3
        int *restrict c    = (int*)safemalloc(n * sizeof(int));
709
18
        for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; }
710
711
3
        long count_le = 0, count_ge = 0, total = 0;
712
713        #define TALLY_PERM() do {                                    \
714          double s_ = 0.0;                                     \
715          for (int ii = 0; ii < n; ii++) {                    \
716                   double d_ = (double)(ii + 1) - (double)perm[ii];\
717                   s_ += d_ * d_;                                   \
718          }                                                    \
719          if (s_ <= s_obs + 1e-9) count_le++;                 \
720          if (s_ >= s_obs - 1e-9) count_ge++;                 \
721          total++;                                             \
722        } while (0)
723
724
18
        TALLY_PERM();   /* initial permutation [1, 2, ..., n] */
725
726
3
        unsigned int k = 1;
727
618
        while (k < n) {
728
615
          if (c[k] < k) {
729                   int tmp;
730
357
                   if (k % 2 == 0) {
731
132
                       tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp;
732                   } else {
733
225
                       tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp;
734                   }
735
2142
                   TALLY_PERM();
736
357
                   c[k]++;
737
357
                   k = 1;
738          } else {
739
258
                   c[k] = 0;
740
258
                   k++;
741          }
742        }
743        #undef TALLY_PERM
744
745
3
        Safefree(perm); Safefree(c);
746        /* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs)  â€” upper rho tail
747        * p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs)  â€” lower rho tail  */
748
3
        double p_le = (double)count_le / (double)total;
749
3
        double p_ge = (double)count_ge / (double)total;
750
751
3
        if (strcmp(alt, "greater") == 0) return p_le;
752
3
        if (strcmp(alt, "less")    == 0) return p_ge;
753        /* two.sided: 2 × the smaller tail, clamped to 1 */
754
3
        double p = 2.0 * (p_le < p_ge ? p_le : p_ge);
755
3
        return (p > 1.0) ? 1.0 : p;
756}
757/* -----------------------------------------------------------------------
758 * Exact Kendall p-value via Mahonian Numbers (Inversions distribution)
759 * Matches R's behavior for N < 50 without ties.
760 * ----------------------------------------------------------------------- */
761
6
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) {
762
6
        long max_inv = (long)n * (n - 1) / 2;
763
6
        double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
764
72
        for (long i = 0; i <= max_inv; i++) dp[i] = 0.0;
765
6
        dp[0] = 1.0;
766        /* Build the distribution of inversions via DP */
767
30
        for (size_t i = 2; i <= n; i++) {
768
24
          double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
769
288
          for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0;
770
24
          int current_max_inv = i * (i - 1) / 2;
771
168
          for (int k = 0; k <= current_max_inv; k++) {
772
144
                   double sum = 0;
773
618
                   for (int j = 0; j <= i - 1 && k - j >= 0; j++) {
774
474
                       sum += dp[k - j];
775                   }
776                   // Divide by 'i' directly to keep array as pure probabilities and prevent overflow
777
144
                   next_dp[k] = sum / (double)i;
778          }
779
24
          Safefree(dp);
780
24
          dp = next_dp;
781        }
782        // Convert S statistic to target number of inversions
783
6
        long i_obs = (long)round((max_inv - s_obs) / 2.0);
784
6
        if (i_obs < 0) i_obs = 0;
785
6
        if (i_obs > max_inv) i_obs = max_inv;
786
6
        double p_le = 0.0; /* P(S <= S_obs) */
787
60
        for (long k = i_obs; k <= max_inv; k++) p_le += dp[k];
788
6
        double p_ge = 0.0; /* P(S >= S_obs) */
789
24
        for (long k = 0; k <= i_obs; k++) p_ge += dp[k];
790
6
        Safefree(dp);
791
6
        if (strcmp(alt, "greater") == 0) return p_ge;
792
6
        if (strcmp(alt, "less") == 0) return p_le;
793        // two.sided
794
3
        double p = 2.0 * (p_ge < p_le ? p_ge : p_le);
795
3
        return p > 1.0 ? 1.0 : p;
796}
797// F-distribution Cumulative Distribution Function P(F <= f)
798
78
static double pf(double f, double df1, double df2) {
799
78
        if (f <= 0.0) return 0.0;
800
78
        double x = (df1 * f) / (df1 * f + df2);
801
78
        return incbeta(df1 / 2.0, df2 / 2.0, x);
802}
803
804/* Householder QR Decomposition for Sequential Sums of Squares */
805/* Householder QR Decomposition for Sequential Sums of Squares */
806
18
static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) {
807
18
        size_t r = 0; // Rank/Row tracker
808
72
        for (size_t k = 0; k < p; k++) {
809
54
                aliased[k] = false;
810
54
                if (r >= n) {
811
0
                        aliased[k] = true;
812
0
                        continue;
813                }
814
815
54
                double max_val = 0;
816
489
                for (size_t i = r; i < n; i++) {
817
435
                        if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]);
818                }
819
54
                if (max_val < 1e-10) {
820
3
                        aliased[k] = true;
821
3
                        continue;
822                } // Collinear or zero column
823
824
51
                double norm = 0;
825
477
                for (size_t i = r; i < n; i++) {
826
426
                        X[i][k] /= max_val;
827
426
                        norm += X[i][k] * X[i][k];
828                }
829
51
                norm = sqrt(norm);
830
51
                double s = (X[r][k] > 0) ? -norm : norm;
831
51
                double u1 = X[r][k] - s;
832
51
                X[r][k] = s * max_val;
833
834
108
                for (size_t j = k + 1; j < p; j++) {
835
57
                        double dot = u1 * X[r][j];
836
570
                        for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k];
837
57
                        double tau = dot / (s * u1);
838
57
                        X[r][j] += tau * u1;
839
570
                        for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k];
840                }
841
842                // Transform the response vector y
843
51
                double dot_y = u1 * y[r];
844
426
                for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k];
845
51
                double tau_y = dot_y / (s * u1);
846
51
                y[r] += tau_y * u1;
847
426
                for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k];
848
849
51
                rank_map[k] = r; // Map original column index to orthogonal row index
850
51
                r++;
851        }
852
18
}
853
854// --- write_table Helpers ---
855
856// Sorts string arrays alphabetically
857
75
static int cmp_string_wt(const void *a, const void *b) {
858
75
        return strcmp(*(const char**)a, *(const char**)b);
859}
860
861// Emulates Perl's /\D/ check
862
13
static bool contains_nondigit(SV *restrict sv) {
863
13
        if (!sv || !SvOK(sv)) return 0;
864        STRLEN len;
865
13
        const char *restrict s = SvPVbyte(sv, len);
866
26
        for (size_t i = 0; i < len; i++) {
867
13
          if (!isdigit(s[i])) return 1;
868        }
869
13
        return 0;
870}
871
872// Writes a properly quoted string dynamically
873
522
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) {
874
522
        if (!str) str = "";
875
522
        bool needs_quotes = 0;
876
522
        if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) {
877
33
          needs_quotes = 1;
878        }
879
880
522
        if (needs_quotes) {
881
33
          PerlIO_putc(fh, '"');
882
354
          for (const char *restrict p = str; *p; p++) {
883
321
                   if (*p == '"') {
884
21
                       PerlIO_putc(fh, '"');
885
21
                       PerlIO_putc(fh, '"');
886                   } else {
887
300
                       PerlIO_putc(fh, *p);
888                   }
889          }
890
33
          PerlIO_putc(fh, '"');
891        } else {
892
489
          PerlIO_puts(fh, str);
893        }
894
522
}
895
896// Writes an array of strings joined by sep
897
141
static void print_string_row(PerlIO *fh, const char **row, size_t len, const char *sep) {
898
141
        size_t sep_len = strlen(sep);
899
663
        for (size_t i = 0; i < len; i++) {
900
522
          if (i > 0) PerlIO_write(fh, sep, sep_len);
901
522
          if (row[i]) {
902
522
                   print_str_quoted(fh, row[i], sep);
903          } else {
904
0
                   print_str_quoted(fh, "", sep);
905          }
906        }
907
141
        PerlIO_putc(fh, '\n');
908
141
}
909// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x)
910// This perfectly replicates R's pchisq(..., lower.tail=FALSE)
911
17
double igamc(double a, double x) {
912
17
        if (x < 0.0 || a <= 0.0) return 1.0;
913
17
        if (x == 0.0) return 1.0;
914
915        // Series expansion for x < a + 1
916
17
        if (x < a + 1.0) {
917
8
                double sum = 1.0 / a;
918
8
                double term = 1.0 / a;
919
8
                double n = 1.0;
920
112
                while (fabs(term) > 1e-15) {
921
104
                        term *= x / (a + n);
922
104
                        sum += term;
923
104
                        n += 1.0;
924                }
925
8
                return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a)));
926        }
927
928        // Continued fraction for x >= a + 1
929
9
        double b = x + 1.0 - a;
930
9
        double c = 1.0 / 1e-30;
931
9
        double d = 1.0 / b;
932
9
        double h = d, i = 1.0;
933
141
        while (i < 10000) { // Safety bound
934
141
                double an = -i * (i - a);
935
141
                b += 2.0;
936
141
                d = an * d + b;
937
141
                if (fabs(d) < 1e-30) d = 1e-30;
938
141
                c = b + an / c;
939
141
                if (fabs(c) < 1e-30) c = 1e-30;
940
141
                d = 1.0 / d;
941
141
                double del = d * c;
942
141
                h *= del;
943
141
                if (fabs(del - 1.0) < 1e-15) break;
944
132
                i += 1.0;
945        }
946
9
        return h * exp(-x + a * log(x) - lgamma(a));
947}
948
949// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2)
950
17
double get_p_value(double stat, int df) {
951
17
        if (df <= 0) return 1.0;
952
17
        if (stat <= 0.0) return 1.0;
953
17
        return igamc((double)df / 2.0, stat / 2.0);
954}
955
956/* --- C HELPER SECTION --- */
957#ifndef M_SQRT1_2
958#define M_SQRT1_2 0.70710678118654752440
959#endif
960
961/* Robust Binomial Coefficient using long double */
962
6
static long double choose_comb(int n, int k) {
963
6
        if (k < 0 || k > n) return 0.0L;
964
6
        if (k > n / 2) k = n - k;
965
6
        long double res = 1.0L;
966
24
        for (int i = 1; i <= k; i++) {
967
18
          res = res * (long double)(n - i + 1) / (long double)i;
968        }
969
6
        return res;
970}
971
972/* Exact CDF for Mann-Whitney U: P(U <= q)
973   Mathematically identical to R's cwilcox generating function */
974
12
static double exact_pwilcox(double q, int m, int n) {
975
12
    int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz
976
12
    int max_u = m * n;
977
12
    if (k < 0) return 0.0;
978
6
    if (k >= max_u) return 1.0;
979
980
6
    long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double));
981
6
    w[0] = 1.0L;
982
983
24
    for (int j = 1; j <= n; j++) {
984
162
        for (int i = j; i <= max_u; i++) w[i] += w[i - j];
985
108
        for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m];
986    }
987
988
6
    long double cum_p = 0.0L;
989
12
    for (int i = 0; i <= k; i++) cum_p += w[i];
990
991
6
    long double total = choose_comb(m + n, n);
992
6
    double result = (double)(cum_p / total);
993
994
6
    Safefree(w);
995
6
    return result;
996}
997
998/* Exact CDF for Wilcoxon Signed Rank: P(V <= q)
999   Mathematically identical to R's csignrank subset-sum DP */
1000
18
static double exact_psignrank(double q, int n) {
1001
18
        int k = (int)floor(q + 1e-7);
1002
18
        int max_v = n * (n + 1) / 2;
1003
18
        if (k < 0) return 0.0;
1004
18
        if (k >= max_v) return 1.0;
1005
1006
15
        long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double));
1007
15
        w[0] = 1.0L;
1008
1009
138
        for (int i = 1; i <= n; i++) {
1010
4746
          for (int j = max_v; j >= i; j--) w[j] += w[j - i];
1011        }
1012
1013
15
        long double cum_p = 0.0L;
1014
546
        for (int i = 0; i <= k; i++) cum_p += w[i];
1015
1016
15
        long double total = powl(2.0L, (long double)n);
1017
15
        double result = (double)(cum_p / total);
1018
1019
15
        Safefree(w);
1020
15
        return result;
1021}
1022
1023
860
static int cmp_rank_info(const void *a, const void *b) {
1024
860
        double da = ((const RankInfo*)a)->val;
1025
860
        double db = ((const RankInfo*)b)->val;
1026
860
        return (da > db) - (da < db);
1027}
1028
1029
32
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) {
1030
32
        if (n == 0) return 0.0;
1031
32
        qsort(ri, n, sizeof(RankInfo), cmp_rank_info);
1032
32
        size_t i = 0;
1033
32
        double tie_adj = 0.0;
1034
32
        *has_ties = 0;
1035
357
        while (i < n) {
1036
325
                size_t j = i + 1;
1037
349
                while (j < n && ri[j].val == ri[i].val) j++;
1038
325
                double r = (double)(i + 1 + j) / 2.0;
1039
674
                for (size_t k = i; k < j; k++) ri[k].rank = r;
1040
325
                size_t t = j - i;
1041
325
                if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); }
1042
325
                i = j;
1043        }
1044
32
        return tie_adj;
1045}
1046/* --- KS-TEST C HELPER SECTION --- */
1047#ifndef M_PI_2
1048#define M_PI_2 1.57079632679489661923
1049#endif
1050#ifndef M_PI_4
1051#define M_PI_4 0.78539816339744830962
1052#endif
1053#ifndef M_1_SQRT_2PI
1054#define M_1_SQRT_2PI 0.39894228040143267794
1055#endif
1056
1057// Scalar integer power used by K2x
1058
117
static double r_pow_di(double x, int n) {
1059
117
        if (n == 0) return 1.0;
1060
117
        if (n < 0) return 1.0 / r_pow_di(x, -n);
1061
117
        double val = 1.0;
1062
1314
        for (int i = 0; i < n; i++) val *= x;
1063
117
        return val;
1064}
1065
1066// Two-sample two-sided asymptotic distribution
1067
0
static double K2l(double x, int lower, double tol) {
1068        double s, z, p;
1069        int k;
1070
0
        if(x <= 0.) {
1071
0
          if(lower) p = 0.;
1072
0
          else p = 1.;
1073
0
        } else if(x < 1.) {
1074
0
          int k_max = (int) sqrt(2.0 - log(tol));
1075
0
          double w = log(x);
1076
0
          z = - (M_PI_2 * M_PI_4) / (x * x);
1077
0
          s = 0;
1078
0
          for(k = 1; k < k_max; k += 2) {
1079
0
                   s += exp(k * k * z - w);
1080          }
1081
0
          p = s / M_1_SQRT_2PI;
1082
0
          if(!lower) p = 1.0 - p;
1083        } else {
1084          double new_val, old_val;
1085
0
          z = -2.0 * x * x;
1086
0
          s = -1.0;
1087
0
          if(lower) {
1088
0
                   k = 1; old_val = 0.0; new_val = 1.0;
1089          } else {
1090
0
                   k = 2; old_val = 0.0; new_val = 2.0 * exp(z);
1091          }
1092
0
          while(fabs(old_val - new_val) > tol) {
1093
0
                   old_val = new_val;
1094
0
                   new_val += 2.0 * s * exp(z * k * k);
1095
0
                   s *= -1.0;
1096
0
                   k++;
1097          }
1098
0
          p = new_val;
1099        }
1100
0
        return p;
1101}
1102
1103// Auxiliary routines used by K2x() for matrix operations
1104
21
static void m_multiply(double *A, double *B, double *C, unsigned int m) {
1105
420
        for(unsigned int i = 0; i < m; i++) {
1106
7980
          for(unsigned int j = 0; j < m; j++) {
1107
7581
                   double s = 0.;
1108
151620
                   for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j];
1109
7581
                   C[i * m + j] = s;
1110          }
1111        }
1112
21
}
1113
1114
18
static void m_power(double *A, int eA, double *V, int *eV, int m, int n) {
1115
18
    if(n == 1) {
1116
1086
        for(int i = 0; i < m * m; i++) V[i] = A[i];
1117
3
        *eV = eA;
1118
3
        return;
1119    }
1120
15
    m_power(A, eA, V, eV, m, n / 2);
1121
15
    double *restrict B = (double*) safecalloc(m * m, sizeof(double));
1122
15
    m_multiply(V, V, B, m);
1123
15
    int eB = 2 * (*eV);
1124
15
    if((n % 2) == 0) {
1125
3258
        for(int i = 0; i < m * m; i++) V[i] = B[i];
1126
9
        *eV = eB;
1127    } else {
1128
6
        m_multiply(A, B, V, m);
1129
6
        *eV = eA + eB;
1130    }
1131
15
    if(V[(m / 2) * m + (m / 2)] > 1e140) {
1132
0
        for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140;
1133
0
        *eV += 140;
1134    }
1135
15
    Safefree(B);
1136}
1137
1138// One-sample two-sided exact distribution
1139
3
static double K2x(int n, double d) {
1140
3
        int k = (int) (n * d) + 1;
1141
3
        int m = 2 * k - 1;
1142
3
        double h = k - n * d;
1143
3
        double *restrict H = (double*) safecalloc(m * m, sizeof(double));
1144
3
        double *restrict Q = (double*) safecalloc(m * m, sizeof(double));
1145
1146
60
        for(int i = 0; i < m; i++) {
1147
1140
          for(int j = 0; j < m; j++) {
1148
1083
                   if(i - j + 1 < 0) H[i * m + j] = 0;
1149
624
                   else H[i * m + j] = 1;
1150          }
1151        }
1152
60
        for(int i = 0; i < m; i++) {
1153
57
          H[i * m] -= r_pow_di(h, i + 1);
1154
57
          H[(m - 1) * m + i] -= r_pow_di(h, (m - i));
1155        }
1156
3
        H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0);
1157
1158
60
        for(int i = 0; i < m; i++) {
1159
1140
          for(int j = 0; j < m; j++) {
1160
1083
                   if(i - j + 1 > 0) {
1161
4560
                       for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g;
1162                   }
1163          }
1164        }
1165
1166
3
        int eH = 0, eQ;
1167
3
        m_power(H, eH, Q, &eQ, m, n);
1168
3
        double s = Q[(k - 1) * m + k - 1];
1169
1170
153
        for(int i = 1; i <= n; i++) {
1171
150
          s = s * (double)i / (double)n;
1172
150
          if(s < 1e-140) {
1173
0
                   s *= 1e140;
1174
0
                   eQ -= 140;
1175          }
1176        }
1177
3
        s *= pow(10.0, eQ);
1178
3
        Safefree(H);
1179
3
        Safefree(Q);
1180
3
        return s;
1181}
1182
1183// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously
1184
9
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny,
1185                               double *d, double *d_plus, double *d_minus) {
1186
9
        qsort(x, nx, sizeof(double), compare_doubles);
1187
9
        qsort(y, ny, sizeof(double), compare_doubles);
1188
9
        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1189
9
        size_t i = 0, j = 0;
1190
1191
729
        while(i < nx || j < ny) {
1192          double val;
1193
720
          if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j];
1194
117
          else if (i < nx) val = x[i];
1195
0
          else val = y[j];
1196
1197
1170
          while(i < nx && x[i] <= val) i++;
1198
990
          while(j < ny && y[j] <= val) j++;
1199
1200
720
          double cdf1 = (double)i / nx;
1201
720
          double cdf2 = (double)j / ny;
1202
720
          double diff = cdf1 - cdf2;
1203
1204
720
          if (diff > max_d_plus) max_d_plus = diff;
1205
720
          if (-diff > max_d_minus) max_d_minus = -diff;
1206
720
          if (fabs(diff) > max_d) max_d = fabs(diff);
1207        }
1208
9
        *d = max_d;
1209
9
        *d_plus = max_d_plus;
1210
9
        *d_minus = max_d_minus;
1211
9
}
1212
1213// Branch the DP boundary check based on the 'alternative'
1214
14220
static int psmirnov_exact_test(double q, double r, double s, int two_sided) {
1215
14220
        if (two_sided) return (fabs(r - s) >= q);
1216
9480
        return ((r - s) >= q); // Used for both D+ and D- via symmetry
1217}
1218
1219// Evaluate the exact 2-sample probability
1220
9
static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) {
1221
9
        double md = (double) m, nd = (double) n;
1222
9
        double *u = (double *) safecalloc(n + 1, sizeof(double));
1223
9
        u[0] = 0.;
1224
1225
279
        for(unsigned int j = 1; j <= n; j++) {
1226
270
          if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.;
1227
216
          else u[j] = u[j - 1];
1228        }
1229
459
        for(unsigned int i = 1; i <= m; i++) {
1230
450
          if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.;
1231
13950
          for(int j = 1; j <= n; j++) {
1232
13500
                   if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.;
1233                   else {
1234
10002
                       double v = (double)(i) / (double)(i + j);
1235
10002
                       double w = (double)(j) / (double)(i + j);
1236
10002
                       u[j] = v * u[j] + w * u[j - 1];
1237                   }
1238          }
1239        }
1240
9
        double res = u[n];
1241
9
        Safefree(u);
1242
9
        return res;
1243}
1244
1245
687
static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) {
1246
687
        double nu = (n - 1.0) * (double)tsample;
1247
687
        if (nu < 1e-7) nu = 1e-7;
1248
1249        // Ensure sig_level/tside is not truncated
1250
687
        double p_tail = sig_level / (double)tside;
1251
687
        double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE)
1252
1253
687
        double ncp = sqrt(n / (double)tsample) * (delta / sd);
1254
1255
687
        if (strict && tside == 2) {
1256          // Use R-style tail calls: 1 - P(T < qu) + P(T < -qu)
1257
0
          return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp);
1258        } else {
1259          // Default: 1 - P(T < qu)
1260          // Ensure exact_pnt is using a convergence tolerance of at least 1e-15
1261
687
          return 1.0 - exact_pnt(qu, nu, ncp);
1262        }
1263}
1264// --- XS SECTION ---
1265MODULE = Stats::LikeR  PACKAGE = Stats::LikeR
1266
1267SV* ks_test(...)
1268CODE:
1269{
1270
12
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1271
12
        short int exact = -1;
1272
12
        const char *restrict alternative = "two.sided";
1273
12
        int arg_idx = 0;
1274
1275        // Shift arrays if provided positionally
1276
12
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1277
12
                x_sv = ST(arg_idx);
1278
12
                arg_idx++;
1279        }
1280        // Check if second argument is an array (2-sample) or a string representing a CDF (1-sample)
1281
12
        if (arg_idx < items) {
1282
12
                if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1283
9
                        y_sv = ST(arg_idx);
1284
9
                        arg_idx++;
1285
3
                } else if (SvPOK(ST(arg_idx))) {
1286
3
                        y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic
1287
3
                        arg_idx++;
1288                }
1289        }
1290
1291        // Parse named arguments
1292
18
        for (; arg_idx < items; arg_idx += 2) {
1293
6
          const char *restrict key = SvPV_nolen(ST(arg_idx));
1294
6
          SV *restrict val = ST(arg_idx + 1);
1295
6
          if      (strEQ(key, "x"))           x_sv = val;
1296
6
          else if (strEQ(key, "y"))           y_sv = val;
1297
6
          else if (strEQ(key, "exact"))       {
1298
0
                   if (!SvOK(val)) exact = -1;
1299
0
                   else exact = SvTRUE(val) ? 1 : 0;
1300          }
1301
6
          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1302
0
          else croak("ks_test: unknown argument '%s'", key);
1303        }
1304
1305
12
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
1306
0
          croak("ks_test: 'x' is a required argument and must be an ARRAY reference");
1307        }
1308
1309
12
        bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0;
1310
12
        bool is_greater   = strEQ(alternative, "greater") ? 1 : 0;
1311
12
        bool is_less      = strEQ(alternative, "less") ? 1 : 0;
1312
1313
12
        if (!is_two_sided && !is_greater && !is_less) {
1314
0
          croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'");
1315        }
1316
1317
12
        AV *restrict x_av = (AV*)SvRV(x_sv);
1318
12
        size_t nx = av_len(x_av) + 1;
1319
12
        if (nx == 0) croak("Not enough 'x' observations");
1320
1321        // Extract 'x' array to C-array
1322
12
        double *restrict x_data = (double *)safemalloc(nx * sizeof(double));
1323
12
        size_t valid_nx = 0;
1324
612
        for (size_t i = 0; i < nx; i++) {
1325
600
          SV**restrict el = av_fetch(x_av, i, 0);
1326
600
          if (el && SvOK(*el) && looks_like_number(*el)) {
1327
600
                   x_data[valid_nx++] = SvNV(*el);
1328          }
1329        }
1330
1331
12
        double statistic = 0.0, p_value = 0.0;
1332
12
        const char *restrict method_desc = "";
1333
1334        // --- TWO SAMPLE ---
1335
21
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1336
9
          AV *restrict y_av = (AV*)SvRV(y_sv);
1337
9
          size_t ny = av_len(y_av) + 1;
1338
1339
9
          double *restrict y_data = (double *)safemalloc(ny * sizeof(double));
1340
9
          size_t valid_ny = 0;
1341
279
          for (size_t i = 0; i < ny; i++) {
1342
270
                   SV**restrict el = av_fetch(y_av, i, 0);
1343
270
                   if (el && SvOK(*el) && looks_like_number(*el)) {
1344
270
                       y_data[valid_ny++] = SvNV(*el);
1345                   }
1346          }
1347
1348
9
          if (valid_nx < 1 || valid_ny < 1) {
1349
0
                   Safefree(x_data); Safefree(y_data);
1350
0
                   croak("Not enough non-missing observations for KS test");
1351          }
1352
1353          double d, d_plus, d_minus;
1354
9
          calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus);
1355
1356          // Map alternative to the correct statistic
1357
9
          if (is_greater) statistic = d_plus;
1358
6
          else if (is_less) statistic = d_minus;
1359
3
          else statistic = d;
1360
1361          // Determine if exact or asymptotic
1362
9
          bool use_exact = false;
1363
9
          if (exact == 1) use_exact = true;
1364
9
          else if (exact == 0) use_exact = false;
1365
9
          else use_exact = (valid_nx * valid_ny < 10000);
1366
1367          // Check for ties in combined set
1368
9
          size_t total_n = valid_nx + valid_ny;
1369
9
          double *restrict comb = (double *)safemalloc(total_n * sizeof(double));
1370
459
          for(size_t i=0; i<valid_nx; i++) comb[i] = x_data[i];
1371
279
          for(size_t i=0; i<valid_ny; i++) comb[valid_nx+i] = y_data[i];
1372
9
          qsort(comb, total_n, sizeof(double), compare_doubles);
1373
1374
9
          bool has_ties = false;
1375
720
          for(size_t i = 1; i < total_n; i++) {
1376
711
                   if(comb[i] == comb[i-1]) { has_ties = true; break; }
1377          }
1378
9
          Safefree(comb);
1379
9
          if (use_exact && has_ties) {
1380
0
                   warn("cannot compute exact p-value with ties; falling back to asymptotic");
1381
0
                   use_exact = false;
1382          }
1383
9
          if (use_exact) {
1384
9
                   method_desc = "Two-sample Kolmogorov-Smirnov exact test";
1385
9
                   double q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((double)valid_nx * valid_ny);
1386
9
                   p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided);
1387          } else {
1388
0
                   method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)";
1389
0
                   double z = statistic * sqrt((double)(valid_nx * valid_ny) / (valid_nx + valid_ny));
1390
0
                   if (is_two_sided) {
1391
0
                       p_value = K2l(z, 0, 1e-9);
1392                   } else {
1393
0
                       p_value = exp(-2.0 * z * z); // One-sided limit distribution
1394                   }
1395          }
1396
9
          Safefree(y_data);
1397        }
1398        // --- ONE SAMPLE (e.g. against pnorm) ---
1399
6
        else if (y_sv && SvPOK(y_sv)) {
1400
3
          const char *restrict dist = SvPV_nolen(y_sv);
1401
3
          if (strEQ(dist, "pnorm")) {
1402
3
                   qsort(x_data, valid_nx, sizeof(double), compare_doubles);
1403
3
                   double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1404
153
                   for(size_t i = 0; i < valid_nx; i++) {
1405
150
                       double cdf_obs_low  = (double)i / valid_nx;
1406
150
                       double cdf_obs_high = (double)(i + 1) / valid_nx;
1407
150
                       double cdf_theor    = approx_pnorm(x_data[i]);
1408
1409
150
                       double diff1 = cdf_obs_low - cdf_theor;
1410
150
                       double diff2 = cdf_obs_high - cdf_theor;
1411
1412
150
                       if (diff1 > max_d_plus) max_d_plus = diff1;
1413
150
                       if (diff2 > max_d_plus) max_d_plus = diff2;
1414
150
                       if (-diff1 > max_d_minus) max_d_minus = -diff1;
1415
150
                       if (-diff2 > max_d_minus) max_d_minus = -diff2;
1416
1417
150
                       if (fabs(diff1) > max_d) max_d = fabs(diff1);
1418
150
                       if (fabs(diff2) > max_d) max_d = fabs(diff2);
1419                   }
1420
3
                   if (is_greater) statistic = max_d_plus;
1421
3
                   else if (is_less) statistic = max_d_minus;
1422
3
                   else statistic = max_d;
1423
3
                   bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1);
1424
3
                   if (use_exact) {
1425
3
                       method_desc = "One-sample Kolmogorov-Smirnov exact test";
1426
3
                       if (is_two_sided) {
1427
3
                           p_value = 1.0 - K2x(valid_nx, statistic);
1428                       } else {
1429
0
                           warn("exact 1-sample 1-sided KS test not implemented; using asymptotic");
1430
0
                           double z = statistic * sqrt((double)valid_nx);
1431
0
                           p_value = exp(-2.0 * z * z);
1432                       }
1433                   } else {
1434
0
                       method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)";
1435
0
                       double z = statistic * sqrt((double)valid_nx);
1436
0
                       if (is_two_sided) p_value = K2l(z, 0, 1e-6);
1437
0
                       else p_value = exp(-2.0 * z * z);
1438                   }
1439          } else {
1440
0
                    Safefree(x_data);
1441
0
                    croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist);
1442          }
1443        } else {
1444
0
          Safefree(x_data);
1445
0
          croak("ks_test: Invalid arguments for 'y'.");
1446        }
1447
12
        Safefree(x_data);
1448
12
        if (p_value > 1.0) p_value = 1.0;
1449
12
        if (p_value < 0.0) p_value = 0.0;
1450
12
        HV *restrict res = newHV();
1451
12
        hv_stores(res, "statistic", newSVnv(statistic));
1452
12
        hv_stores(res, "p_value", newSVnv(p_value));
1453
12
        hv_stores(res, "method", newSVpv(method_desc, 0));
1454
12
        hv_stores(res, "alternative", newSVpv(alternative, 0));
1455
12
        RETVAL = newRV_noinc((SV*)res);
1456}
1457OUTPUT:
1458    RETVAL
1459
1460SV* wilcox_test(...)
1461CODE:
1462{
1463
30
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1464
30
        bool paired = false, correct = true;
1465
30
        double mu = 0.0;
1466
30
        short int exact = -1;
1467
30
        const char *restrict alternative = "two.sided";
1468
30
        int arg_idx = 0;
1469        // 1. Shift first positional argument as 'x' if it's an array reference
1470
30
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1471
6
                x_sv = ST(arg_idx);
1472
6
                arg_idx++;
1473        }
1474        // 2. Shift second positional argument as 'y' if it's an array reference
1475
30
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1476
6
                y_sv = ST(arg_idx);
1477
6
                arg_idx++;
1478        }
1479        // Ensure the remaining arguments form complete key-value pairs
1480
30
        if ((items - arg_idx) % 2 != 0) {
1481
0
                croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)");
1482        }
1483        // --- Parse named arguments from the remaining flat stack ---
1484
90
        for (; arg_idx < items; arg_idx += 2) {
1485
60
                const char *restrict key = SvPV_nolen(ST(arg_idx));
1486
60
                SV *restrict val = ST(arg_idx + 1);
1487
60
                if      (strEQ(key, "x"))           x_sv = val;
1488
39
                else if (strEQ(key, "y"))           y_sv = val;
1489
18
                else if (strEQ(key, "paired"))      paired = SvTRUE(val);
1490
9
                else if (strEQ(key, "correct"))     correct = SvTRUE(val);
1491
9
                else if (strEQ(key, "mu"))          mu = SvNV(val);
1492
6
                else if (strEQ(key, "exact"))       {
1493
0
                        if (!SvOK(val)) exact = -1;
1494
0
                        else exact = SvTRUE(val) ? 1 : 0;
1495                }
1496
6
                else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1497
0
                else croak("wilcox_test: unknown argument '%s'", key);
1498        }
1499        // --- Validate required / types ---
1500
30
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
1501
3
                croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference");
1502
27
        AV *restrict x_av = (AV*)SvRV(x_sv);
1503
27
        size_t nx = av_len(x_av) + 1;
1504
27
        if (nx == 0) croak("Not enough 'x' observations");
1505
1506
27
        AV *restrict y_av = NULL;
1507
27
        size_t ny = 0;
1508
27
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1509
24
                y_av = (AV*)SvRV(y_sv);
1510
24
                ny = av_len(y_av) + 1;
1511        }
1512
27
        double p_value = 0.0, statistic = 0.0;
1513
27
        const char *restrict method_desc = "";
1514
27
        bool use_exact = false;
1515        // --- TWO SAMPLE (Mann-Whitney) ---
1516
42
        if (ny > 0 && !paired) {
1517
15
                RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo));
1518
15
                size_t valid_nx = 0, valid_ny = 0;
1519
99
                for (size_t i = 0; i < nx; i++) {
1520
84
                        SV**restrict el = av_fetch(x_av, i, 0);
1521
84
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1522
84
                                ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x
1523
84
                                ri[valid_nx].idx = 1;
1524
84
                                valid_nx++;
1525                        }
1526                }
1527
99
                for (size_t i = 0; i < ny; i++) {
1528
84
                        SV**restrict el = av_fetch(y_av, i, 0);
1529
84
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1530
84
                                ri[valid_nx + valid_ny].val = SvNV(*el);
1531
84
                                ri[valid_nx + valid_ny].idx = 2;
1532
84
                                valid_ny++;
1533                        }
1534                }
1535
15
                if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); }
1536
15
                if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); }
1537
15
                size_t total_n = valid_nx + valid_ny;
1538
15
                bool has_ties = 0;
1539
15
                double tie_adj = rank_and_count_ties(ri, total_n, &has_ties);
1540
15
                double w_rank_sum = 0.0;
1541
183
                for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank;
1542
15
                statistic = w_rank_sum - (double)valid_nx * (valid_nx + 1.0) / 2.0;
1543
1544
15
                if (exact == 1) use_exact = true;
1545
15
                else if (exact == 0) use_exact = false;
1546
15
                else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties);
1547
1548
15
                if (use_exact && has_ties) {
1549
0
                        warn("cannot compute exact p-value with ties; falling back to approximation");
1550
0
                        use_exact = false;
1551                }
1552
15
                if (use_exact) {
1553
6
                        method_desc = "Wilcoxon rank sum exact test";
1554
6
                        double p_less = exact_pwilcox(statistic, valid_nx, valid_ny);
1555
6
                        double p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny);
1556
1557
6
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1558
3
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1559                        else {
1560
0
                                double p = (p_less < p_greater) ? p_less : p_greater;
1561
0
                                p_value = 2.0 * p;
1562                        }
1563                } else {
1564
9
                        method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test";
1565
9
                        double exp = (double)valid_nx * valid_ny / 2.0;
1566
9
                        double var = ((double)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0)));
1567
9
                        double z = statistic - exp;
1568
1569
9
                        double CORRECTION = 0.0;
1570
9
                        if (correct) {
1571
9
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1572
0
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1573
0
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1574                        }
1575
9
                        z = (z - CORRECTION) / sqrt(var);
1576
1577
9
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1578
9
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1579
9
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1580                }
1581
15
                Safefree(ri);
1582        } else { // --- ONE SAMPLE / PAIRED ---
1583
12
                if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test");
1584
9
                double *restrict diffs = (double *)safemalloc(nx * sizeof(double));
1585
9
                size_t n_nz = 0;
1586
9
                bool has_zeroes = false;
1587
78
                for (size_t i = 0; i < nx; i++) {
1588
69
                        SV**restrict x_el = av_fetch(x_av, i, 0);
1589
69
                        if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue;
1590
69
                        double dx = SvNV(*x_el);
1591
1592
69
                        if (paired) {
1593
54
                                SV**restrict y_el = av_fetch(y_av, i, 0);
1594
54
                                if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue;
1595
54
                                double dy = SvNV(*y_el);
1596
54
                                double d = dx - dy - mu;
1597
54
                                if (d == 0.0) has_zeroes = true; // Drop exact zeroes
1598
54
                                else diffs[n_nz++] = d;
1599                        } else {
1600
15
                                double d = dx - mu;
1601
15
                                if (d == 0.0) has_zeroes = true;
1602
15
                                else diffs[n_nz++] = d;
1603                        }
1604                }
1605
9
                if (n_nz == 0) {
1606
0
                        Safefree(diffs);
1607
0
                        croak("not enough (non-missing) observations");
1608                }
1609
9
                RankInfo *ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo));
1610
78
                for (size_t i = 0; i < n_nz; i++) {
1611
69
                        ri[i].val = fabs(diffs[i]);
1612
69
                        ri[i].idx = (diffs[i] > 0);
1613                }
1614
9
                bool has_ties = 0;
1615
9
                double tie_adj = rank_and_count_ties(ri, n_nz, &has_ties);
1616
9
                statistic = 0.0;
1617
78
                for (size_t i = 0; i < n_nz; i++) {
1618
69
                        if (ri[i].idx) statistic += ri[i].rank;
1619                }
1620
9
                if (exact == 1) use_exact = true;
1621
9
                else if (exact == 0) use_exact = false;
1622
9
                else use_exact = (n_nz < 50 && !has_ties);
1623
9
                if (use_exact && has_ties) {
1624
0
                        warn("cannot compute exact p-value with ties; falling back to approximation");
1625
0
                        use_exact = false;
1626                }
1627
9
                if (use_exact && has_zeroes) {
1628
0
                        warn("cannot compute exact p-value with zeroes; falling back to approximation");
1629
0
                        use_exact = false;
1630                }
1631
9
                if (use_exact) {
1632
9
                        method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test";
1633
9
                        double p_less = exact_psignrank(statistic, n_nz);
1634
9
                        double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz);
1635
1636
9
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1637
9
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1638                        else {
1639
9
                                double p = (p_less < p_greater) ? p_less : p_greater;
1640
9
                                p_value = 2.0 * p;
1641                        }
1642                } else {
1643
0
                        method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test";
1644
0
                        double exp = (double)n_nz * (n_nz + 1.0) / 4.0;
1645
0
                        double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0);
1646
0
                        double z = statistic - exp;
1647
0
                        double CORRECTION = 0.0;
1648
0
                        if (correct) {
1649
0
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1650
0
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1651
0
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1652                        }
1653
0
                        z = (z - CORRECTION) / sqrt(var);
1654
1655
0
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1656
0
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1657
0
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1658                }
1659
9
                Safefree(ri); Safefree(diffs);
1660        }
1661
24
        if (p_value > 1.0) p_value = 1.0;
1662
24
        HV *restrict res = newHV();
1663
24
        hv_stores(res, "statistic", newSVnv(statistic));
1664
24
        hv_stores(res, "p_value", newSVnv(p_value));
1665
24
        hv_stores(res, "method", newSVpv(method_desc, 0));
1666
24
        hv_stores(res, "alternative", newSVpv(alternative, 0));
1667
24
        RETVAL = newRV_noinc((SV*)res);
1668}
1669OUTPUT:
1670        RETVAL
1671
1672SV* _chisq_c(data_ref)
1673    SV* data_ref;
1674CODE:
1675{
1676
9
        AV*restrict obs_av = (AV*)SvRV(data_ref);
1677
9
        int r = av_top_index(obs_av) + 1, c = 0;
1678
9
        bool is_2d = 0;
1679
9
        SV**restrict first_elem = av_fetch(obs_av, 0, 0);
1680
9
        if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
1681
6
                is_2d = 1;
1682
6
                AV*restrict first_row = (AV*)SvRV(*first_elem);
1683
6
                c = av_top_index(first_row) + 1;
1684        } else {
1685
3
                c = r;
1686
3
                r = 1;
1687        }
1688
1689
9
        double stat = 0.0, grand_total = 0.0;
1690
9
        int df = 0;
1691
9
        int yates = (is_2d && r == 2 && c == 2) ? 1 : 0;
1692
1693
9
        AV*restrict expected_av = newAV();
1694
9
        if (is_2d) {
1695
6
                double *restrict row_sum = (double*)safemalloc(r * sizeof(double));
1696
6
                double *restrict col_sum = (double*)safemalloc(c * sizeof(double));
1697
18
                for(unsigned int i=0; i<r; i++) row_sum[i] = 0.0;
1698
21
                for(unsigned int j=0; j<c; j++) col_sum[j] = 0.0;
1699
18
                for (unsigned int i = 0; i < r; i++) {
1700
12
                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1701
12
                        AV*restrict row = (AV*)SvRV(*row_sv);
1702
42
                        for (unsigned int j = 0; j < c; j++) {
1703
30
                                 SV**restrict val_sv = av_fetch(row, j, 0);
1704
30
                                 double val = SvNV(*val_sv);
1705
30
                                 row_sum[i] += val;
1706
30
                                 col_sum[j] += val;
1707
30
                                 grand_total += val;
1708                        }
1709                }
1710
18
                for (unsigned int i = 0; i < r; i++) {
1711
12
                        AV*restrict exp_row = newAV();
1712
12
                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1713
12
                        AV*restrict row = (AV*)SvRV(*row_sv);
1714
42
                        for (unsigned int j = 0; j < c; j++) {
1715
30
                                double E = (row_sum[i] * col_sum[j]) / grand_total;
1716
30
                                SV**restrict val_sv = av_fetch(row, j, 0);
1717
30
                                double O = SvNV(*val_sv);
1718
30
                                av_push(exp_row, newSVnv(E));
1719
30
                                if (yates) {
1720                                  // Exact R logic: min(0.5, abs(O - E))
1721
12
                                  double abs_diff = fabs(O - E);
1722
12
                                  double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff;
1723
12
                                  double diff = abs_diff - y_corr;
1724
12
                                  stat += (diff * diff) / E;
1725                                } else {
1726
18
                                  stat += ((O - E) * (O - E)) / E;
1727                                }
1728                        }
1729
12
                        av_push(expected_av, newRV_noinc((SV*)exp_row));
1730                }
1731
6
                safefree(row_sum); safefree(col_sum);
1732
6
                df = (r - 1) * (c - 1);
1733        } else {
1734
12
          for (unsigned int j = 0; j < c; j++) {
1735
9
                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1736
9
                   grand_total += SvNV(*val_sv);
1737          }
1738
3
          double E = grand_total / (double)c;
1739
12
          for (unsigned int j = 0; j < c; j++) {
1740
9
                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1741
9
                   double O = SvNV(*val_sv);
1742
9
                   av_push(expected_av, newSVnv(E));
1743
9
                   stat += ((O - E) * (O - E)) / E;
1744          }
1745
3
          df = c - 1;
1746        }
1747
9
        double p_val = get_p_value(stat, df);
1748
9
        HV*restrict results = newHV();
1749
9
        hv_store(results, "statistic", 9, newSVnv(stat), 0);
1750
9
        hv_store(results, "df", 2, newSViv(df), 0);
1751
9
        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
1752
9
        hv_store(results, "expected", 8, newRV_noinc((SV*)expected_av), 0);
1753
9
        if (is_2d) {
1754
6
                if (yates) {
1755
3
                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0);
1756                } else {
1757
3
                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0);
1758                }
1759        } else {
1760
3
          hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0);
1761        }
1762
9
        RETVAL = newRV_noinc((SV*)results);
1763}
1764OUTPUT:
1765        RETVAL
1766
1767PROTOTYPES: ENABLE
1768
1769void write_table(...)
1770PPCODE:
1771{
1772
37
        SV *restrict data_sv = NULL;
1773
37
        SV *restrict file_sv = NULL;
1774
37
        unsigned int arg_idx = 0;
1775
1776        // Mimic the Perl shift logic
1777
37
        if (arg_idx < items && SvROK(ST(arg_idx))) {
1778
37
          int type = SvTYPE(SvRV(ST(arg_idx)));
1779
37
          if (type == SVt_PVHV || type == SVt_PVAV) {
1780
37
                   data_sv = ST(arg_idx);
1781
37
                   arg_idx++;
1782          }
1783        }
1784
37
        if (arg_idx < items) {
1785
37
          file_sv = ST(arg_idx);
1786
37
          arg_idx++;
1787        }
1788
1789
37
        const char *restrict sep = ",";
1790
37
        SV *restrict row_names_sv = sv_2mortal(newSViv(1));
1791
37
        SV *restrict col_names_sv = NULL;
1792
1793        // Read the remaining Hash-style arguments
1794
104
        for (; arg_idx < items; arg_idx += 2) {
1795
67
          if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed");
1796
67
          const char *restrict key = SvPV_nolen(ST(arg_idx));
1797
67
          SV *restrict val = ST(arg_idx + 1);
1798
67
          if (strEQ(key, "data")) data_sv = val;
1799
67
          else if (strEQ(key, "col.names")) col_names_sv = val;
1800
55
          else if (strEQ(key, "file")) file_sv = val;
1801
55
          else if (strEQ(key, "row.names")) row_names_sv = val;
1802
31
          else if (strEQ(key, "sep")) sep = SvPV_nolen(val);
1803
0
          else croak("write_table: Unknown arguments passed: %s", key);
1804        }
1805
1806
37
        if (!data_sv || !SvROK(data_sv)) {
1807
0
          croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1808        }
1809
37
        SV *restrict data_ref = SvRV(data_sv);
1810
37
        if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) {
1811
0
          croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1812        }
1813
1814
37
        if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n");
1815
37
        const char *restrict file = SvPV_nolen(file_sv);
1816
1817
37
        if (col_names_sv && SvOK(col_names_sv)) {
1818
12
          if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) {
1819
3
                   croak("write_table: 'col.names' must be an ARRAY reference\n");
1820          }
1821        }
1822
1823
34
        bool is_hoh = 0, is_hoa = 0, is_aoh = 0;
1824
34
        AV *restrict rows_av = NULL;
1825
1826        // Validate Input Structures & Homogeneity
1827
34
        if (SvTYPE(data_ref) == SVt_PVHV) {
1828
25
                HV *restrict hv = (HV*)data_ref;
1829
25
                if (hv_iterinit(hv) == 0) XSRETURN_EMPTY;
1830
1831
25
                HE *restrict entry = hv_iternext(hv);
1832
25
                SV *restrict first_val = hv_iterval(hv, entry);
1833
25
                if (!first_val || !SvROK(first_val)) {
1834
0
                        croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1835                }
1836
25
                int first_type = SvTYPE(SvRV(first_val));
1837
25
                if (first_type != SVt_PVHV && first_type != SVt_PVAV) {
1838
0
                        croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1839                }
1840
1841
25
                is_hoh = (first_type == SVt_PVHV);
1842
25
                is_hoa = (first_type == SVt_PVAV);
1843
1844
25
                hv_iterinit(hv);
1845
88
                while ((entry = hv_iternext(hv))) {
1846
63
                        SV *restrict val = hv_iterval(hv, entry);
1847
63
                        if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != first_type) {
1848
0
                                 croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY");
1849                        }
1850                }
1851
1852
25
                if (is_hoh) {
1853
12
                        rows_av = newAV();
1854
12
                        hv_iterinit(hv);
1855
36
                        while ((entry = hv_iternext(hv))) {
1856
24
                                 av_push(rows_av, newSVsv(hv_iterkeysv(entry)));
1857                        }
1858                }
1859        } else {
1860
9
                AV *restrict av = (AV*)data_ref;
1861
9
                if (av_len(av) < 0) XSRETURN_EMPTY;
1862
9
                SV **restrict first_ptr = av_fetch(av, 0, 0);
1863
9
                if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) {
1864
0
                        croak("write_table: For ARRAY data, all elements must be HASH references (Array of Hashes)\n");
1865                }
1866
1867
30
                for (size_t i = 0; i <= av_len(av); i++) {
1868
21
                        SV **restrict ptr = av_fetch(av, i, 0);
1869
21
                        if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) {
1870
0
                                 croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n");
1871                        }
1872                }
1873
9
                is_aoh = 1;
1874        }
1875
1876
34
        PerlIO *restrict fh = PerlIO_open(file, "w");
1877
34
        if (!fh) croak("write_table: Could not open '%s' for writing", file);
1878
1879
34
        AV *restrict headers_av = newAV();
1880
34
        int inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0;
1881
34
        const char *restrict rownames_col = NULL;
1882
1883        // ----- Hash of Hashes -----
1884
34
        if (is_hoh) {
1885
15
                if (col_names_sv && SvOK(col_names_sv)) {
1886
3
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
1887
12
                        for(size_t i=0; i<=av_len(c_av); i++) {
1888
9
                                SV **restrict c = av_fetch(c_av, i, 0);
1889
9
                                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
1890                        }
1891                } else {
1892
9
                        HV *restrict col_map = newHV();
1893
9
                        hv_iterinit((HV*)data_ref);
1894                        HE *restrict entry;
1895
27
                        while((entry = hv_iternext((HV*)data_ref))) {
1896
18
                                 HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry));
1897
18
                                 hv_iterinit(inner);
1898                                 HE *restrict inner_entry;
1899
54
                                 while((inner_entry = hv_iternext(inner))) {
1900
36
                                     hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0);
1901                                 }
1902                        }
1903
9
                        unsigned num_cols = hv_iterinit(col_map);
1904
9
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
1905
36
                        for(unsigned i=0; i<num_cols; i++) {
1906
27
                                 HE *restrict ce = hv_iternext(col_map);
1907
27
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
1908                        }
1909
9
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
1910
36
                        for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
1911
9
                        safefree(col_array);
1912
9
                        SvREFCNT_dec(col_map);
1913          }
1914
12
          size_t num_headers = av_len(headers_av) + 1;
1915
12
          const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
1916
1917
12
          size_t h_idx = 0;
1918
12
          if (inc_rownames) header_row[h_idx++] = "";
1919
48
          for(unsigned short int i=0; i<num_headers; i++) {
1920
36
                   SV**restrict h_ptr = av_fetch(headers_av, i, 0);
1921
36
                   header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
1922          }
1923
12
          print_string_row(fh, header_row, h_idx, sep);
1924
12
          safefree(header_row);
1925
1926
12
          size_t num_rows = av_len(rows_av) + 1;
1927
12
          const char **restrict row_array = safemalloc(num_rows * sizeof(char*));
1928
36
          for(size_t i=0; i<num_rows; i++) {
1929
24
                row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0));
1930          }
1931
12
          qsort(row_array, num_rows, sizeof(char*), cmp_string_wt);
1932
1933
12
          HV *restrict data_hv = (HV*)data_ref;
1934
12
          const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
1935
1936
33
          for(size_t i=0; i<num_rows; i++) {
1937
24
                   size_t d_idx = 0;
1938
24
                   if (inc_rownames) row_data[d_idx++] = row_array[i];
1939
1940
24
                   SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0);
1941
24
                   HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL;
1942
1943
99
                   for(size_t j=0; j<num_headers; j++) {
1944
78
                       SV**restrict h_ptr = av_fetch(headers_av, j, 0);
1945
78
                       const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
1946
78
                       SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL;
1947
78
                       if (cell_ptr && SvOK(*cell_ptr)) {
1948
45
                           if (SvROK(*cell_ptr)) {
1949
3
                               PerlIO_close(fh);
1950
3
                               safefree(row_array);
1951
3
                     safefree(row_data);
1952
3
                       if (headers_av) SvREFCNT_dec(headers_av);
1953
3
                       if (rows_av) SvREFCNT_dec(rows_av);
1954
3
                               croak("write_table: Cannot write nested reference types to table\n");
1955                           }
1956
42
                           row_data[d_idx++] = SvPV_nolen(*cell_ptr);
1957                       } else {
1958
33
                           row_data[d_idx++] = "NA";
1959                       }
1960                   }
1961
21
                   print_string_row(fh, row_data, d_idx, sep);
1962          }
1963
9
          safefree(row_array); safefree(row_data);
1964
1965
22
        } else if (is_hoa) { // ----- Hash of Arrays -----
1966
13
                HV *restrict data_hv = (HV*)data_ref;
1967
13
                size_t max_rows = 0;
1968
13
                hv_iterinit(data_hv);
1969                HE *restrict entry;
1970
52
                while((entry = hv_iternext(data_hv))) {
1971
39
                        AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry));
1972
39
                        size_t len = av_len(arr) + 1;
1973
39
                        if (len > max_rows) max_rows = len;
1974                }
1975
1976
16
                if (col_names_sv && SvOK(col_names_sv)) {
1977
3
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
1978
12
                        for(size_t i=0; i<=av_len(c_av); i++) {
1979
9
                                 SV **restrict c = av_fetch(c_av, i, 0);
1980
9
                                 if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
1981                        }
1982                } else {
1983
10
                        unsigned int num_cols = hv_iterinit(data_hv);
1984
10
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
1985
40
                        for(unsigned int i=0; i<num_cols; i++) {
1986
30
                                 HE *restrict ce = hv_iternext(data_hv);
1987
30
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
1988                        }
1989
10
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
1990
40
                        for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
1991
10
                        safefree(col_array);
1992                }
1993
13
                if (av_len(headers_av) < 0) croak("Could not get headers in write_table");
1994
13
                if (inc_rownames && contains_nondigit(row_names_sv)) {
1995
0
                        rownames_col = SvPV_nolen(row_names_sv);
1996
0
                        AV *restrict filtered_headers = (AV*)sv_2mortal((SV*)newAV());
1997
1998
0
                        for(size_t i=0; i<=av_len(headers_av); i++) {
1999
0
                                 SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2000
0
                                 if (!h_ptr || !*h_ptr) continue;
2001
0
                                 SV *restrict h_sv = *h_ptr;
2002
0
                                 if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2003
0
                                     av_push(filtered_headers, newSVsv(h_sv));
2004                                 }
2005                        }
2006
0
                        SvREFCNT_dec(headers_av);
2007
0
                        headers_av = filtered_headers;
2008                }
2009
13
                size_t num_headers = av_len(headers_av) + 1;
2010
13
                const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2011
13
                size_t h_idx = 0;
2012
13
                if (inc_rownames) header_row[h_idx++] = "";
2013
52
                for(size_t i=0; i<num_headers; i++) {
2014
39
                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2015
39
                        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2016                }
2017
13
                print_string_row(fh, header_row, h_idx, sep);
2018
13
                safefree(header_row);
2019
13
                const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2020
78
                for(size_t i=0; i<max_rows; i++) {
2021
65
                        size_t d_idx = 0;
2022
65
                        if (inc_rownames) {
2023
53
                                 if (rownames_col) {
2024
0
                                     SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0);
2025
0
                                     if (rn_arr_ptr && SvROK(*rn_arr_ptr)) {
2026
0
                                         AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr);
2027
0
                                         SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0);
2028
0
                                         if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2029
0
                                             if (SvROK(*rn_val_ptr)) {
2030
0
                                                 PerlIO_close(fh);
2031
0
                                                 safefree(row_data);
2032
0
                                                 if (headers_av) SvREFCNT_dec(headers_av);
2033
0
                                                 croak("write_table: Cannot write nested reference types to table\n");
2034                                             }
2035
0
                                             row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2036                                         } else {
2037
0
                                             row_data[d_idx++] = "NA";
2038                                         }
2039                                     } else {
2040
0
                                         row_data[d_idx++] = "NA";
2041                                     }
2042                                 } else {
2043                                     char buf[32];
2044
53
                                     snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2045
53
                                     row_data[d_idx++] = savepv(buf);
2046                                 }
2047                        }
2048
260
                        for(size_t j=0; j<num_headers; j++) {
2049
195
                                 SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2050
195
                                 const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2051
195
                                 SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0);
2052
390
                                 if (arr_ptr && SvROK(*arr_ptr)) {
2053
195
                                     AV *restrict arr = (AV*)SvRV(*arr_ptr);
2054
195
                                     SV **restrict cell_ptr = av_fetch(arr, i, 0);
2055
195
                                     if (cell_ptr && SvOK(*cell_ptr)) {
2056
104
                                         if (SvROK(*cell_ptr)) {
2057
0
                                             PerlIO_close(fh);
2058
0
                                             safefree(row_data);
2059
0
                                             if (headers_av) SvREFCNT_dec(headers_av);
2060
0
                                             croak("write_table: Cannot write nested reference types to table\n");
2061                                         }
2062
104
                                         row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2063                                     } else {
2064
91
                                         row_data[d_idx++] = "NA";
2065                                     }
2066                                 } else {
2067
0
                                     row_data[d_idx++] = "NA";
2068                                 }
2069                        }
2070
65
                        print_string_row(fh, row_data, d_idx, sep);
2071
65
                        if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2072                }
2073
13
                safefree(row_data);
2074
9
        } else if (is_aoh) {// ----- Array of Hashes -----
2075
9
                AV *restrict data_av = (AV*)data_ref;
2076
9
                size_t num_rows = av_len(data_av) + 1;
2077
12
                if (col_names_sv && SvOK(col_names_sv)) {
2078
3
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
2079
9
                        for(size_t i=0; i<=av_len(c_av); i++) {
2080
6
                                 SV **restrict c = av_fetch(c_av, i, 0);
2081
6
                                 if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2082                        }
2083                } else {
2084
6
                        HV *restrict col_map = newHV();
2085
21
                        for(size_t i=0; i<num_rows; i++) {
2086
15
                                 SV **restrict row_ptr = av_fetch(data_av, i, 0);
2087
15
                                 if (row_ptr && SvROK(*row_ptr)) {
2088
15
                                     HV *restrict row_hv = (HV*)SvRV(*row_ptr);
2089
15
                                     hv_iterinit(row_hv);
2090                                     HE *restrict entry;
2091
42
                                     while((entry = hv_iternext(row_hv))) {
2092
27
                                         hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0);
2093                                     }
2094                                 }
2095                        }
2096
6
                        unsigned num_cols = hv_iterinit(col_map);
2097
6
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2098
21
                        for(unsigned int i=0; i<num_cols; i++) {
2099
15
                                 HE *restrict ce = hv_iternext(col_map);
2100
15
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2101                        }
2102
6
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2103
21
                        for(unsigned int i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2104
6
                        safefree(col_array);
2105
6
                        SvREFCNT_dec(col_map);
2106                }
2107
9
                if (inc_rownames && contains_nondigit(row_names_sv)) {
2108
0
                        rownames_col = SvPV_nolen(row_names_sv);
2109
0
                        AV *restrict filtered_headers = newAV();
2110
0
                        for(size_t i=0; i<=av_len(headers_av); i++) {
2111
0
                                 SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2112
0
                                 if (!h_ptr || !*h_ptr) continue;
2113
0
                                 SV *restrict h_sv = *h_ptr;
2114
0
                                 if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2115
0
                                     av_push(filtered_headers, newSVsv(h_sv));
2116                                 }
2117                        }
2118
0
                        SvREFCNT_dec(headers_av);
2119
0
                        headers_av = filtered_headers;
2120                }
2121
9
                size_t num_headers = av_len(headers_av) + 1;
2122
9
                const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2123
9
                size_t h_idx = 0;
2124
9
                if (inc_rownames) header_row[h_idx++] = "";
2125
30
                for(size_t i=0; i<num_headers; i++) {
2126
21
                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2127
21
                        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2128                }
2129
9
                print_string_row(fh, header_row, h_idx, sep);
2130
9
                safefree(header_row);
2131
9
                const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2132
30
                for(size_t i=0; i<num_rows; i++) {
2133
21
                        size_t d_idx = 0;
2134
21
                        SV **restrict row_ptr = av_fetch(data_av, i, 0);
2135
21
                        HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL;
2136
21
                        if (inc_rownames) {
2137
9
                                if (rownames_col) {
2138
0
                                  SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL;
2139
0
                                  if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2140
0
                                                if (SvROK(*rn_val_ptr)) {
2141
0
                                                         PerlIO_close(fh);
2142
0
                                                                  safefree(row_data);
2143
0
                                                                  if (headers_av) SvREFCNT_dec(headers_av);
2144
0
                                                         croak("write_table: Cannot write nested reference types to table\n");
2145                                                }
2146
0
                                                row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2147                                  } else {
2148
0
                                                row_data[d_idx++] = "NA";
2149                                  }
2150                                } else {
2151                                  char buf[32];
2152
9
                                  snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2153
9
                                  row_data[d_idx++] = savepv(buf);
2154                                }
2155                        }
2156
2157
72
                        for(size_t j=0; j<num_headers; j++) {
2158
51
                                 SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2159
51
                                 const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2160
51
                                 SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL;
2161
51
                                 if (cell_ptr && SvOK(*cell_ptr)) {
2162
39
                                     if (SvROK(*cell_ptr)) {
2163
0
                                         PerlIO_close(fh);
2164
0
                         safefree(row_data);
2165
0
                         if (headers_av) SvREFCNT_dec(headers_av);
2166
0
                                         croak("write_table: Cannot write nested reference types to table\n");
2167                                     }
2168
39
                                     row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2169                                 } else {
2170
12
                                     row_data[d_idx++] = "NA";
2171                                 }
2172                        }
2173
21
                        print_string_row(fh, row_data, d_idx, sep);
2174
21
                        if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2175                }
2176
9
                safefree(row_data);
2177        }
2178
31
        if (headers_av) SvREFCNT_dec(headers_av);
2179
31
        if (rows_av) SvREFCNT_dec(rows_av);
2180
31
        PerlIO_close(fh);
2181
31
        XSRETURN_EMPTY;
2182}
2183
2184SV*
2185_parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef)
2186INIT:
2187        PerlIO *restrict fp;
2188
39
        AV *restrict data = NULL;
2189
39
        AV *restrict current_row = newAV();
2190
39
        SV *restrict field = newSVpvs("");
2191
39
        bool in_quotes = 0, post_quote = 0;
2192        size_t sep_len, comment_len;
2193        SV *restrict line_sv;
2194
39
        bool use_cb = 0;
2195CODE:
2196
39
        if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) {
2197
39
                use_cb = 1;
2198        } else {
2199
0
                data = newAV();
2200        }
2201
39
        sep_len = sep_str ? strlen(sep_str) : 0;
2202
39
        comment_len = comment_str ? strlen(comment_str) : 0;
2203
2204
39
        fp = PerlIO_open(file, "r");
2205
39
        if (!fp) {
2206
0
                croak("Could not open file '%s'", file);
2207        }
2208
39
        line_sv = newSV_type(SVt_PV);
2209        // Read line by line using PerlIO
2210
11925
        while (sv_gets(line_sv, fp, 0) != NULL) {
2211
11886
                char *restrict line = SvPV_nolen(line_sv);
2212
11886
                size_t len = SvCUR(line_sv);
2213                // chomp \r\n (Handles Windows invisible \r natively)
2214
11886
                if (len > 0 && line[len-1] == '\n') {
2215
11880
                        len--;
2216
11880
                        if (len > 0 && line[len-1] == '\r') {
2217
11088
                                len--;
2218                        }
2219                }
2220
11886
                if (!in_quotes) {
2221                        // Skip completely empty lines (\h*[\r\n]+$ equivalent)
2222
11883
                        bool is_empty = 1;
2223
11886
                        for (size_t i = 0; i < len; i++) {
2224
11886
                                if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; }
2225                        }
2226
11883
                        if (is_empty) continue;
2227
2228                        // Skip comments
2229
11883
                        if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) {
2230
0
                                continue;
2231                        }
2232                }
2233                // --- CORE PARSING MACHINE ---
2234
880764
                for (size_t i = 0; i < len; i++) {
2235
868878
                        const char ch = line[i];
2236
868878
                        if (ch == '\r') continue;
2237
868875
                        if (ch == '"') {
2238
66975
                                if (in_quotes && (i + 1 < len) && line[i+1] == '"') {
2239
9
                                        sv_catpvn(field, "\"", 1);
2240
9
                                        i++; // Skip the escaped second quote
2241
66966
                                } else if (in_quotes) {
2242
33480
                                        in_quotes = 0;  // Close quotes
2243
33480
                                        post_quote = 1;
2244
33486
                                } else if (!post_quote) {
2245
33483
                                        in_quotes = 1; // Open quotes (only when not in post-quote state)
2246                                }
2247
801900
                        } else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) {
2248
154845
                                av_push(current_row, newSVsv(field));
2249
154845
                                sv_setpvs(field, ""); // Reset for next field
2250
154845
                                i += sep_len - 1;     // Advance past multi-char separators
2251
154845
                                post_quote = 0;
2252                        } else {
2253
647055
                                sv_catpvn(field, &ch, 1);
2254                        }
2255                }
2256
11886
                if (in_quotes) {
2257                        // Line ended but quotes are still open! Append newline and fetch next
2258
6
                        sv_catpvn(field, "\n", 1);
2259                } else {
2260
11880
                        post_quote = 0; // Reset post-quote state at row boundary
2261                        // Push the final field of the record
2262
11880
                        av_push(current_row, newSVsv(field));
2263
11880
                        sv_setpvs(field, "");
2264                        // If a callback is provided, invoke it in a streaming fashion
2265
11880
                        if (use_cb) {
2266
11880
                                dSP;
2267
11880
                                ENTER;
2268
11880
                                SAVETMPS;
2269
11880
                                PUSHMARK(SP);
2270
11880
                                XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2271
11880
                                PUTBACK;
2272
11880
                                call_sv(callback, G_DISCARD);
2273
11880
                                FREETMPS;
2274
11880
                                LEAVE;
2275
11880
                                SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it
2276                        } else {
2277
0
                                av_push(data, newRV_noinc((SV*)current_row));
2278                        }
2279
11880
                        current_row = newAV();
2280                }
2281        }
2282
39
        PerlIO_close(fp);
2283
39
        SvREFCNT_dec(line_sv);
2284
2285
39
        if (in_quotes) {
2286
3
                av_push(current_row, newSVsv(field));
2287
3
                if (use_cb) {
2288
3
                        dSP;
2289
3
                        ENTER;
2290
3
                        SAVETMPS;
2291
3
                        PUSHMARK(SP);
2292
3
                        XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2293
3
                        PUTBACK;
2294
3
                        call_sv(callback, G_DISCARD);
2295
3
                        FREETMPS;
2296
3
                        LEAVE;
2297
3
                        SvREFCNT_dec(current_row);
2298                } else {
2299
0
                        av_push(data, newRV_noinc((SV*)current_row));
2300                }
2301
3
                current_row = newAV();
2302        }
2303
39
        SvREFCNT_dec(field);
2304
39
        SvREFCNT_dec(current_row);
2305
2306
39
        if (use_cb) {
2307
39
                RETVAL = &PL_sv_undef; // Memory was fully handled by callback stream
2308        } else {
2309
0
                RETVAL = newRV_noinc((SV*)data);
2310        }
2311OUTPUT:
2312        RETVAL
2313
2314SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson")
2315        CODE:
2316        {
2317                // 1. Validate inputs are Array References
2318
12
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
2319
0
                        croak("cov: first argument 'x' must be an ARRAY reference");
2320                }
2321
12
                if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) {
2322
0
                        croak("cov: second argument 'y' must be an ARRAY reference");
2323                }
2324
2325                // 2. Validate method argument
2326
12
                if (strcmp(method, "pearson") != 0 &&
2327
6
                        strcmp(method, "spearman") != 0 &&
2328
3
                        strcmp(method, "kendall") != 0) {
2329
0
                        croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method);
2330                }
2331
2332
12
                AV *restrict x_av = (AV*)SvRV(x_sv);
2333
12
                AV *restrict y_av = (AV*)SvRV(y_sv);
2334
12
                size_t nx = av_len(x_av) + 1;
2335
12
                size_t ny = av_len(y_av) + 1;
2336
2337
12
                if (nx != ny) {
2338
0
                        croak("cov: incompatible dimensions (x has %lu, y has %lu)",
2339                                   (unsigned long)nx, (unsigned long)ny);
2340                }
2341
2342                // 3. Extract Valid Pairwise Data
2343                // Allocate temporary C arrays for numeric processing
2344
12
                double *restrict x_val = (double*)safemalloc(nx * sizeof(double));
2345
12
                double *restrict y_val = (double*)safemalloc(nx * sizeof(double));
2346
12
                size_t n = 0;
2347
2348
72
                for (size_t i = 0; i < nx; i++) {
2349
60
                        SV **restrict x_tv = av_fetch(x_av, i, 0);
2350
60
                        SV **restrict y_tv = av_fetch(y_av, i, 0);
2351
2352                        // Extract numeric values, defaulting to NAN for missing/invalid data
2353
60
                        double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN;
2354
60
                        double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN;
2355
2356                        // Pairwise complete observations (skips NAs seamlessly like R)
2357
60
                        if (!isnan(xv) && !isnan(yv)) {
2358
60
                                 x_val[n] = xv;
2359
60
                                 y_val[n] = yv;
2360
60
                                 n++;
2361                        }
2362                }
2363
2364                // 4. Handle edge cases where data is too sparse
2365
12
                if (n < 2) {
2366
0
                        Safefree(x_val);
2367
0
                        Safefree(y_val);
2368
0
                        RETVAL = newSVnv(NAN);
2369                } else {
2370
12
                        double ans = 0.0;
2371
2372                        // 5. Algorithm routing
2373
12
                        if (strcmp(method, "kendall") == 0) {
2374                                 // R's default cov(..., method="kendall") iterates the full n x n space
2375
18
                                 for (size_t i = 0; i < n; i++) {
2376
90
                                     for (size_t j = 0; j < n; j++) {
2377
75
                                         int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]);
2378
75
                                         int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]);
2379
75
                                         ans += (double)(sx * sy);
2380                                     }
2381                                 }
2382                        } else {
2383
9
                                 double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0;
2384
2385
9
                                 if (strcmp(method, "spearman") == 0) {
2386                                     // Spearman: Rank the data first, then run standard covariance
2387
3
                                     double *restrict rx = (double*)safemalloc(n * sizeof(double));
2388
3
                                     double *restrict ry = (double*)safemalloc(n * sizeof(double));
2389
2390                                     // Uses your existing rank_data() helper from LikeR.xs
2391
3
                                     rank_data(x_val, rx, n);
2392
3
                                     rank_data(y_val, ry, n);
2393
2394
18
                                     for (size_t i = 0; i < n; i++) {
2395
15
                                         double dx = rx[i] - mean_x;
2396
15
                                         mean_x += dx / (i + 1);
2397
15
                                         double dy = ry[i] - mean_y;
2398
15
                                         mean_y += dy / (i + 1);
2399
15
                                         cov_sum += dx * (ry[i] - mean_y);
2400                                     }
2401
2402
3
                                     Safefree(rx);
2403
3
                                     Safefree(ry);
2404                                 } else {
2405                                     // Pearson: Welford's Single-Pass Covariance Algorithm
2406
36
                                     for (size_t i = 0; i < n; i++) {
2407
30
                                         double dx = x_val[i] - mean_x;
2408
30
                                         mean_x += dx / (i + 1);
2409
30
                                         double dy = y_val[i] - mean_y;
2410
30
                                         mean_y += dy / (i + 1);
2411
30
                                         cov_sum += dx * (y_val[i] - mean_y);
2412                                     }
2413                                 }
2414
2415                                 // Unbiased Sample Covariance (N - 1) for Pearson & Spearman
2416
9
                                 ans = cov_sum / (n - 1);
2417                        }
2418
2419
12
                        Safefree(x_val);
2420
12
                        Safefree(y_val);
2421
12
                        RETVAL = newSVnv(ans);
2422                }
2423        }
2424        OUTPUT:
2425          RETVAL
2426
2427SV* glm(...)
2428CODE:
2429{
2430
21
        const char *restrict formula  = NULL;
2431
21
        SV *restrict data_sv = NULL;
2432
21
        const char *restrict family_str = "gaussian";
2433        char f_cpy[512];
2434        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
2435
2436        /* Dynamic Term Arrays */
2437
21
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
2438
21
        bool *restrict is_dummy = NULL;
2439
21
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
2440
21
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
2441
21
        size_t n = 0, valid_n = 0, i;
2442
21
        bool has_intercept = true, converged = false, boundary = false;
2443
21
        int iter = 0, max_iter = 25, final_rank = 0, df_res = 0;
2444
21
        double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0;
2445
21
        double dispersion = 0.0, epsilon = 1e-8;
2446
2447
21
        char **restrict row_names = NULL;
2448
21
        char **restrict valid_row_names = NULL;
2449
21
        HV **restrict row_hashes = NULL;
2450
21
        HV *restrict data_hoa = NULL;
2451
21
        SV *restrict ref = NULL;
2452
2453
21
        double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL;
2454
21
        double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL;
2455
21
        bool *restrict aliased = NULL;
2456
21
        double *restrict XtWX = NULL, *restrict XtWZ = NULL;
2457
2458        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
2459        AV *restrict terms_av;
2460        HE *restrict entry;
2461
2462
21
        if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)");
2463
2464
78
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
2465
57
          const char *restrict key = SvPV_nolen(ST(i_arg));
2466
57
          SV *restrict val = ST(i_arg + 1);
2467
57
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
2468
36
          else if (strEQ(key, "data"))    data_sv = val;
2469
15
          else if (strEQ(key, "family"))  family_str = SvPV_nolen(val);
2470
0
          else croak("glm: unknown argument '%s'", key);
2471        }        
2472
21
        if (!formula) croak("glm: formula is required");
2473
21
        if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference");
2474
2475
21
        bool is_binomial = (strcmp(family_str, "binomial") == 0);
2476
21
        bool is_gaussian = (strcmp(family_str, "gaussian") == 0);
2477
21
        if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str);
2478
2479        // --- Formula Parsing & Expansion ---
2480
21
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
2481
21
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
2482
21
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
2483
2484
21
        src = (char*)formula; dst = f_cpy;
2485
294
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
2486
21
        *dst = '\0';
2487
2488
21
        tilde = strchr(f_cpy, '~');
2489
21
        if (!tilde) croak("glm: invalid formula, missing '~'");
2490
21
        *tilde = '\0';
2491
21
        lhs = f_cpy; rhs = tilde + 1;
2492
2493
21
        if (strstr(rhs, "-1")) has_intercept = false;
2494
21
        if (has_intercept) terms[num_terms++] = savepv("Intercept");
2495
2496
21
        chunk = strtok(rhs, "+");
2497
60
        while (chunk != NULL) {
2498
39
          if (num_terms >= term_cap - 3) {
2499
0
                   term_cap *= 2;
2500
0
                   Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
2501          }
2502
39
          if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) {
2503
0
                   chunk = strtok(NULL, "+");
2504
0
                   continue;
2505          }
2506
39
          char *restrict star = strchr(chunk, '*');
2507
39
          if (star) {
2508
0
                   *star = '\0';
2509
0
                   char *restrict left = chunk; char *restrict right = star + 1;
2510
0
                   char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
2511
0
                   char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
2512
2513
0
                   terms[num_terms++] = savepv(left);
2514
0
                   terms[num_terms++] = savepv(right);
2515
0
                   size_t inter_len = strlen(left) + strlen(right) + 2;
2516
0
                   terms[num_terms] = (char*)safemalloc(inter_len);
2517
0
                   snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
2518          } else {
2519
39
                   char *restrict c_chunk = strchr(chunk, '^');
2520
39
                   if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
2521
39
                   terms[num_terms++] = savepv(chunk);
2522          }
2523
39
          chunk = strtok(NULL, "+");
2524        }
2525
2526
81
        for (i = 0; i < num_terms; i++) {
2527
60
          bool found = false;
2528
117
          for (size_t j = 0; j < num_uniq; j++) {
2529
57
                   if (strcmp(terms[i], uniq_terms[j]) == 0) { found = true; break; }
2530          }
2531
60
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
2532        }
2533
21
        p = num_uniq;
2534
2535        // --- Data Extraction ---
2536
21
        ref = SvRV(data_sv);
2537
21
        if (SvTYPE(ref) == SVt_PVHV) {
2538
21
                HV*restrict hv = (HV*)ref;
2539
21
                if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty");
2540
21
                entry = hv_iternext(hv);
2541
21
                if (entry) {
2542
21
                        SV*restrict val = hv_iterval(hv, entry);
2543
21
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
2544
6
                                 data_hoa = hv;
2545
6
                                 n = av_len((AV*)SvRV(val)) + 1;
2546
6
                                 Newx(row_names, n, char*);
2547
366
                                 for(i = 0; i < n; i++) {
2548
360
                                     char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1);
2549
360
                                     row_names[i] = savepv(buf);
2550                                 }
2551
15
                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
2552
15
                                 n = hv_iterinit(hv);
2553
15
                                 Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2554
15
                                 i = 0;
2555
495
                                 while ((entry = hv_iternext(hv))) {
2556                                     unsigned int len;
2557
480
                                     row_names[i] = savepv(hv_iterkey(entry, &len));
2558
480
                                     row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
2559
480
                                     i++;
2560                                 }
2561
0
                        } else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
2562                }
2563
0
        } else if (SvTYPE(ref) == SVt_PVAV) {
2564
0
          AV*restrict av = (AV*)ref;
2565
0
          n = av_len(av) + 1;
2566
0
          Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2567
0
          for (i = 0; i < n; i++) {
2568
0
                   SV**restrict val = av_fetch(av, i, 0);
2569
0
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
2570
0
                       row_hashes[i] = (HV*)SvRV(*val);
2571
0
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1);
2572
0
                       row_names[i] = savepv(buf);
2573                   } else {
2574
0
                       for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
2575
0
                       Safefree(row_names); Safefree(row_hashes);
2576
0
                       croak("glm: Array values must be HashRefs (AoH)");
2577                   }
2578          }
2579
0
        } else croak("glm: Data must be an Array or Hash reference");
2580
2581        // --- Categorical Expansion ---
2582
81
        for (size_t j = 0; j < p; j++) {
2583
60
          if (p_exp + 32 >= exp_cap) {
2584
0
                   exp_cap *= 2;
2585
0
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2586
0
                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2587          }
2588
60
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
2589
21
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = false; p_exp++; continue;
2590          }
2591
39
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
2592
3
                   char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8;
2593
3
                   Newx(levels, levels_cap, char*);
2594
183
                   for (i = 0; i < n; i++) {
2595
180
                       char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
2596
180
                       if (str_val) {
2597
180
                           bool found = false;
2598
270
                           for (size_t l = 0; l < num_levels; l++) {
2599
264
                               if (strcmp(levels[l], str_val) == 0) { found = true; break; }
2600                           }
2601
180
                           if (!found) {
2602
6
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
2603
6
                               levels[num_levels++] = savepv(str_val);
2604                           }
2605
180
                           Safefree(str_val);
2606                       }
2607                   }
2608
3
                   if (num_levels > 0) {
2609
6
                       for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
2610
6
                           for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
2611
3
                               if (strcmp(levels[l1], levels[l2]) > 0) {
2612
3
                                   char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
2613                               }
2614                           }
2615                       }
2616
6
                       for (size_t l = 1; l < num_levels; l++) {
2617
3
                           if (p_exp >= exp_cap) {
2618
0
                               exp_cap *= 2;
2619
0
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2620
0
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2621                           }
2622
3
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
2623
3
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
2624
3
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
2625
3
                           is_dummy[p_exp] = true; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]);
2626
3
                           p_exp++;
2627                       }
2628
9
                       for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
2629
3
                       Safefree(levels);
2630                   } else {
2631
0
                       Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
2632                   }
2633          } else {
2634
36
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
2635          }
2636        }
2637
21
        p = p_exp;
2638
2639
21
        Newx(X, n * p, double); Newx(Y, n, double);
2640
21
        Newx(valid_row_names, n, char*);
2641
2642        // --- Listwise Deletion ---
2643
861
        for (size_t i = 0; i < n; i++) {
2644
840
                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
2645
840
                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
2646
2647
840
                bool row_ok = true;
2648
840
                double *restrict row_x = (double*)safemalloc(p * sizeof(double));
2649
3180
                for (size_t j = 0; j < p; j++) {
2650
2340
                        if (strcmp(exp_terms[j], "Intercept") == 0) {
2651
840
                                 row_x[j] = 1.0;
2652
1500
                        } else if (is_dummy[j]) {
2653
180
                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
2654
180
                                 if (str_val) {
2655
180
                                     row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
2656
180
                                     Safefree(str_val);
2657
0
                                 } else { row_ok = false; break; }
2658                        } else {
2659
1320
                                 row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
2660
1320
                                 if (isnan(row_x[j])) { row_ok = false; break; }
2661                        }
2662                }
2663
840
                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
2664
840
                Y[valid_n] = y_val;
2665
3180
                for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
2666
840
                valid_row_names[valid_n] = row_names[i];
2667
840
                valid_n++;
2668
840
                Safefree(row_x);
2669        }
2670
21
        Safefree(row_names);
2671
21
        if (valid_n <= p) {
2672
0
          Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes);
2673
0
          croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)");
2674        }
2675        // --- R glm.fit IRLS Implementation ---
2676
21
        mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double));
2677
21
        W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double));
2678
21
        beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double));
2679
21
        aliased = (bool*)safemalloc(p * sizeof(bool));
2680
21
        XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double));
2681
81
        for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; }
2682        // Initialize (mustart / etastart equivalent)
2683
21
        double sum_y = 0.0;
2684
861
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
2685
21
        double mean_y = sum_y / valid_n;
2686
861
        for (i = 0; i < valid_n; i++) {
2687
840
          if (is_binomial) {
2688
96
                   if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1");
2689
96
                   mu[i] = (Y[i] + 0.5) / 2.0;
2690
96
                   eta[i] = log(mu[i] / (1.0 - mu[i]));
2691
96
                   double dev = 0.0;
2692
96
                   if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2693
39
                   else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2694
0
                   else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2695
96
                   deviance_old += dev;
2696          } else {
2697
744
                   mu[i] = mean_y; // R gaussian init
2698
744
                   eta[i] = mu[i];
2699          }
2700        }
2701        // IRLS Loop
2702
63
        for (iter = 1; iter <= max_iter; iter++) {
2703
2415
                for (i = 0; i < valid_n; i++) {
2704
2352
                        if (is_binomial) {
2705
864
                                 double varmu = mu[i] * (1.0 - mu[i]);
2706
864
                                 double mu_eta = varmu; // Link derivative for logit
2707
864
                                 if (varmu < 1e-10) varmu = 1e-10;
2708
864
                                 Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta;
2709
864
                                 W[i] = (mu_eta * mu_eta) / varmu;
2710                        } else {
2711
1488
                                 W[i] = 1.0;
2712
1488
                                 Z[i] = Y[i];
2713                        }
2714                }
2715                // Formulate XtWX and XtWZ
2716
783
                for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2717
2415
                for (size_t k = 0; k < valid_n; k++) {
2718
2352
                        double w = W[k], z = Z[k];
2719
9048
                        for (i = 0; i < p; i++) {
2720
6696
                                 XtWZ[i] += X[k * p + i] * w * z;
2721
6696
                                 double xw = X[k * p + i] * w;
2722
26064
                                 for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2723                        }
2724                }
2725
63
                final_rank = sweep_matrix_ols(XtWX, p, aliased);
2726
246
                for (i = 0; i < p; i++) {
2727
183
                        if (aliased[i]) { beta[i] = NAN; } else {
2728
183
                                 double sum = 0.0;
2729
720
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j];
2730
183
                                 beta[i] = sum;
2731                        }
2732                }
2733                // Calculate updated ETA, MU, and Deviance (with Step-Halving)
2734
63
                boundary = false;
2735
693
                for (unsigned short int half = 0; half < 10; half++) {
2736
630
                        deviance_new = 0.0;
2737
24150
                        for (i = 0; i < valid_n; i++) {
2738
23520
                                 double linear_pred = 0.0;
2739
90480
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j];
2740
23520
                                 eta[i] = linear_pred;
2741
23520
                                 if (is_binomial) {
2742
8640
                                     mu[i] = 1.0 / (1.0 + exp(-eta[i]));
2743                                     // Boundary enforcement
2744
8640
                                     if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON;
2745
8640
                                     if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON;
2746
2747
8640
                                     double dev = 0.0;
2748
8640
                                     if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2749
3510
                                     else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2750
0
                                     else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2751
8640
                                     deviance_new += dev;
2752                                 } else {
2753
14880
                                     mu[i] = eta[i];
2754
14880
                                     double res = Y[i] - mu[i];
2755
14880
                                     deviance_new += res * res;
2756                                 }
2757                        }
2758                        // Step halving divergence check
2759
630
                        if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) {
2760
600
                                 continue;
2761                        }
2762
2763
30
                        boundary = true;
2764
120
                        for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0;
2765                }
2766                // Convergence Check
2767
63
                if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) {
2768
21
                        converged = true; break;
2769                }
2770
42
                deviance_old = deviance_new;
2771
165
                for (size_t j = 0; j < p; j++) beta_old[j] = beta[j];
2772        }
2773        // Final accurate calculation of W for standard errors
2774
255
        for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2775
861
        for (size_t k = 0; k < valid_n; k++) {
2776
840
          double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0;
2777
840
          if (w < 1e-10) w = 1e-10;
2778
3180
          for (i = 0; i < p; i++) {
2779
2340
                   double xw = X[k * p + i] * w;
2780
9000
                   for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2781          }
2782        }
2783
21
        final_rank = sweep_matrix_ols(XtWX, p, aliased);
2784        // --- Null Deviance Calculation ---
2785
21
        double wtdmu = mean_y; // Since weights are 1.0 initially
2786
861
        for (i = 0; i < valid_n; i++) {
2787
840
          if (is_binomial) {
2788
96
                   if (Y[i] == 0.0)      null_dev += -2.0 * log(1.0 - wtdmu);
2789
39
                   else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu);
2790
0
                   else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu)));
2791          } else {
2792
744
                   double diff = Y[i] - wtdmu;
2793
744
                   null_dev += diff * diff;
2794          }
2795        }
2796        // --- AIC Calculation ---
2797
21
        if (is_gaussian) {
2798
18
          double n_f = (double)valid_n;
2799
18
          aic = n_f * (log(2.0 * M_PI) + 1.0 + log(deviance_new / n_f)) + 2.0 * (final_rank + 1.0);
2800
3
        } else if (is_binomial) {
2801
3
          aic = deviance_new + 2.0 * final_rank;
2802        }
2803        // --- Return Structures ---
2804
21
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
2805
21
        df_res = valid_n - final_rank;
2806
21
        dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN);
2807
861
        for (size_t i = 0; i < valid_n; i++) {
2808
840
                double res = Y[i] - mu[i];
2809
840
                if (is_binomial) {
2810                        // Deviance residuals for binomial
2811
96
                        double d_res = 0.0;
2812
96
                        if (Y[i] == 0.0)      d_res = sqrt(-2.0 * log(1.0 - mu[i]));
2813
39
                        else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i]));
2814
0
                        else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))));
2815
96
                        res = (Y[i] > mu[i]) ? d_res : -d_res;
2816                }
2817
840
                hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0);
2818
840
                hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0);
2819
840
                Safefree(valid_row_names[i]);
2820        }
2821
21
        Safefree(valid_row_names);
2822
2823
21
        summary_hv = newHV(); terms_av = newAV();
2824
81
        for (size_t j = 0; j < p; j++) {
2825
60
                hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
2826
60
                av_push(terms_av, newSVpv(exp_terms[j], 0));
2827
2828
60
                HV *restrict row_hv = newHV();
2829
60
                if (aliased[j]) {
2830
0
                        hv_store(row_hv, "Estimate",   8, newSVpv("NaN", 0), 0);
2831
0
                        hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
2832
0
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0);
2833
0
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0);
2834                } else {
2835
60
                        double se = sqrt(dispersion * XtWX[j * p + j]);
2836
60
                        double val_stat = beta[j] / se;
2837
60
                        double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided");
2838
2839
60
                        hv_store(row_hv, "Estimate",   8, newSVnv(beta[j]), 0);
2840
60
                        hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0);
2841
60
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0);
2842
60
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0);
2843                }
2844
60
                hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
2845        }
2846
2847
21
        hv_store(res_hv, "aic",            3, newSVnv(aic), 0);
2848
21
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv), 0);
2849
21
        hv_store(res_hv, "converged",      9, newSVuv(converged ? 1 : 0), 0);
2850
21
        hv_store(res_hv, "boundary",       8, newSVuv(boundary ? 1 : 0), 0);
2851
21
        hv_store(res_hv, "deviance",       8, newSVnv(deviance_new), 0);
2852
21
        hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0);
2853
21
        hv_store(res_hv, "df.null",        7, newSVuv(valid_n - has_intercept), 0);
2854
21
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res), 0);
2855
21
        hv_store(res_hv, "family",         6, newSVpv(family_str, 0), 0);
2856
21
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
2857
21
        hv_store(res_hv, "iter",           4, newSVuv(iter > max_iter ? max_iter : iter), 0);
2858
21
        hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0);
2859
21
        hv_store(res_hv, "rank",           4, newSVuv(final_rank), 0);
2860
21
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv), 0);
2861
21
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av), 0);
2862
2863        // --- Cleanup ---
2864
81
        for (i = 0; i < num_terms; i++) Safefree(terms[i]);
2865
21
        Safefree(terms);
2866
81
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]);
2867
21
        Safefree(uniq_terms);
2868
81
        for (size_t j = 0; j < p_exp; j++) {
2869
60
                Safefree(exp_terms[j]);
2870
60
                if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
2871        }
2872
21
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
2873
2874
21
        Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W);
2875
21
        Safefree(beta); Safefree(beta_old); Safefree(aliased);
2876
21
        Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y);
2877
21
        if (row_hashes) Safefree(row_hashes);
2878
2879
21
        RETVAL = newRV_noinc((SV*)res_hv);
2880}
2881OUTPUT:
2882    RETVAL
2883
2884SV* cor_test(...)
2885CODE:
2886{
2887
21
        if (items < 2 || items % 2 != 0)
2888
0
                croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)");
2889
2890
21
        SV *restrict x_ref = ST(0), *restrict y_ref = ST(1);
2891
2892
21
        const char *restrict alternative = "two.sided";
2893
21
        const char *restrict method = "pearson";
2894
21
        SV *restrict exact_sv = NULL;
2895
21
        double conf_level = 0.95;
2896
21
        bool continuity = 0;
2897
2898        /* Parse named arguments from the flat stack starting at index 2 */
2899
105
        for (unsigned short int i = 2; i < items; i += 2) {
2900
84
          const char *restrict key = SvPV_nolen(ST(i));
2901
84
          SV *restrict val = ST(i + 1);
2902
2903
84
          if      (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
2904
63
          else if (strEQ(key, "method"))      method = SvPV_nolen(val);
2905
42
          else if (strEQ(key, "exact"))       exact_sv = val;
2906
42
          else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val);
2907
21
          else if (strEQ(key, "continuity"))  continuity = SvTRUE(val);
2908
0
          else croak("cor_test: unknown argument '%s'", key);
2909        }
2910
2911        AV *restrict x_av, *restrict y_av;
2912        double *restrict x, *restrict y;
2913
21
        double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0;
2914
2915
21
        bool is_pearson  = (strcmp(method, "pearson") == 0);
2916
21
        bool is_kendall  = (strcmp(method, "kendall") == 0);
2917
21
        bool is_spearman = (strcmp(method, "spearman") == 0);
2918        HV *restrict rhv;
2919
2920
21
        if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV ||
2921
21
            !SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) {
2922
0
          croak("cor_test: x and y must be array references");
2923        }
2924
2925
21
        x_av = (AV*)SvRV(x_ref);
2926
21
        y_av = (AV*)SvRV(y_ref);
2927
2928
21
        size_t n_raw = av_len(x_av) + 1;
2929
21
        if (n_raw != av_len(y_av) + 1) croak("incompatible dimensions");
2930
2931
21
        x = safemalloc(n_raw * sizeof(double));
2932
21
        y = safemalloc(n_raw * sizeof(double));
2933
2934
21
        size_t n = 0; /* Final count of pairwise complete observations */
2935
138
        for (size_t i = 0; i < n_raw; i++) {
2936
117
          SV **restrict x_val = av_fetch(x_av, i, 0);
2937
117
          SV **restrict y_val = av_fetch(y_av, i, 0);
2938
2939
117
          double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN;
2940
117
          double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN;
2941
2942          /* Pairwise complete observations (skips NAs seamlessly like R) */
2943
117
          if (!isnan(xv) && !isnan(yv)) {
2944
105
              x[n] = xv;
2945
105
              y[n] = yv;
2946
105
              n++;
2947          }
2948        }
2949
2950
21
        if (n < 3) {
2951
0
          Safefree(x);
2952
0
          Safefree(y);
2953
0
          croak("not enough finite observations");
2954        }
2955
2956
21
        if (is_pearson) {
2957          // Welford's Method for Pearson Correlation
2958
12
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
2959
72
          for (size_t i = 0; i < n; i++) {
2960
60
                   double dx = x[i] - mean_x;
2961
60
                   mean_x += dx / (i + 1);
2962
60
                   double dy = y[i] - mean_y;
2963
60
                   mean_y += dy / (i + 1);
2964
60
                   M2_x += dx * (x[i] - mean_x);
2965
60
                   M2_y += dy * (y[i] - mean_y);
2966
60
                   cov  += dx * (y[i] - mean_y);
2967          }
2968
12
          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
2969
12
          df = n - 2;
2970
12
          statistic = estimate * sqrt(df / (1.0 - estimate * estimate));
2971
2972          // Confidence interval using Fisher's Z transform
2973
12
          double z = 0.5 * log((1.0 + estimate) / (1.0 - estimate));
2974
12
          double se = 1.0 / sqrt(n - 3);
2975
12
          double alpha = 1.0 - conf_level;
2976
12
          double q = inverse_normal_cdf(1.0 - alpha/2.0);
2977
12
          ci_lower = tanh(z - q * se);
2978
12
          ci_upper = tanh(z + q * se);
2979
2980          // HIGH-PRECISION P-VALUE USING INCOMPLETE BETA
2981
12
          p_value = get_t_pvalue(statistic, df, alternative);
2982
9
        } else if (is_kendall) {
2983
6
          int c = 0, d = 0, tie_x = 0, tie_y = 0;
2984
30
          for (size_t i = 0; i < n - 1; i++) {
2985
84
                   for (size_t j = i + 1; j < n; j++) {
2986
60
                       double sign_x = (x[i] - x[j] > 0) - (x[i] - x[j] < 0);
2987
60
                       double sign_y = (y[i] - y[j] > 0) - (y[i] - y[j] < 0);
2988
2989
60
                       if (sign_x == 0 && sign_y == 0) { /* Joint tie, ignore */ }
2990
60
                       else if (sign_x == 0) tie_x++;
2991
60
                       else if (sign_y == 0) tie_y++;
2992
60
                       else if (sign_x * sign_y > 0) c++;
2993
12
                       else d++;
2994                   }
2995          }
2996
6
          double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y));
2997
6
          estimate = (denom == 0.0) ? (0.0/0.0) : (double)(c - d) / denom;
2998
2999
6
          bool has_ties = (tie_x > 0 || tie_y > 0);
3000          bool do_exact;
3001
3002          /* Mirror R: exact defaults to TRUE if N < 50 and NO ties */
3003
6
          if (!exact_sv || !SvOK(exact_sv)) {
3004
6
                   do_exact = (n < 50) && !has_ties;
3005          } else {
3006
0
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3007          }
3008          // If forced exact but ties exist, R overrides and falls back to approximation anyway
3009
6
          if (do_exact && has_ties) do_exact = 0;
3010
3011
6
          if (do_exact) {
3012
6
                   double S_stat = c - d;
3013
6
                   statistic = c;
3014
6
                   p_value = kendall_exact_pvalue(n, S_stat, alternative);
3015          } else {
3016                   // Normal approximation for large N or ties
3017
0
                   double var_S = n * (n - 1) * (2.0 * n + 5.0) / 18.0;
3018
0
                   double S = c - d;
3019
0
                   if (continuity) S -= (S > 0 ? 1 : -1);
3020
0
                   statistic = S / sqrt(var_S);
3021
3022
0
                   if (strcmp(alternative, "two.sided") == 0) {
3023
0
                       p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic)));
3024
0
                   } else if (strcmp(alternative, "less") == 0) {
3025
0
                       p_value = approx_pnorm(statistic);
3026                   } else {
3027
0
                       p_value = 1.0 - approx_pnorm(statistic);
3028                   }
3029          }
3030
3
        } else if (is_spearman) {
3031
3
          double *restrict rank_x = safemalloc(n * sizeof(double));
3032
3
          double *restrict rank_y = safemalloc(n * sizeof(double));
3033
3
          compute_ranks(x, rank_x, n);
3034
3
          compute_ranks(y, rank_y, n);
3035
3036          // Spearman rho = Pearson r of the ranks (Welford's Method)
3037
3
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3038
18
          for (size_t i = 0; i < n; i++) {
3039
15
                   double dx = rank_x[i] - mean_x;
3040
15
                   mean_x += dx / (i + 1);
3041
15
                   double dy = rank_y[i] - mean_y;
3042
15
                   mean_y += dy / (i + 1);
3043
15
                   M2_x += dx * (rank_x[i] - mean_x);
3044
15
                   M2_y += dy * (rank_y[i] - mean_y);
3045
15
                   cov  += dx * (rank_y[i] - mean_y);
3046          }
3047
3
          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3048
3049          // S = sum of squared rank differences (R's reported statistic)
3050
3
          double S_stat = 0.0;
3051
18
          for (size_t i = 0; i < n; i++) {
3052
15
                   double diff = rank_x[i] - rank_y[i];
3053
15
                   S_stat += diff * diff;
3054          }
3055
3056          // Ties produce fractional (averaged) ranks — detect them
3057
3
          bool has_ties = 0, do_exact;
3058
18
          for (size_t i = 0; i < n; i++) {
3059
15
                   if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) {
3060
0
                       has_ties = 1;
3061
0
                       break;
3062                   }
3063          }
3064
3
          if (!exact_sv || !SvOK(exact_sv)) {
3065
3
                   do_exact = (n < 10) && !has_ties;
3066          } else {
3067
0
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3068          }
3069
3070
3
          if (do_exact) {
3071
3
                   statistic = S_stat;
3072
3
                   p_value   = spearman_exact_pvalue(S_stat, n, alternative);
3073          } else {
3074
0
                   double r = estimate;
3075
0
                   if (continuity)
3076
0
                       r *= (1.0 - 1.0 / (2.0 * (n - 1)));
3077
0
                   statistic = r * sqrt((n - 2.0) / (1.0 - r * r));
3078
0
                   p_value = get_t_pvalue(statistic, (double)(n - 2), alternative);
3079          }
3080
3
          Safefree(rank_x); Safefree(rank_y);
3081        } else {
3082
0
          Safefree(x); Safefree(y);
3083
0
          croak("Unknown method");
3084        }
3085
21
        Safefree(x); Safefree(y);
3086
21
        rhv = newHV();
3087
21
        hv_stores(rhv, "estimate", newSVnv(estimate));
3088
21
        hv_stores(rhv, "p.value", newSVnv(p_value));
3089
21
        hv_stores(rhv, "statistic", newSVnv(statistic));
3090
21
        hv_stores(rhv, "method", newSVpv(method, 0));
3091
21
        hv_stores(rhv, "alternative", newSVpv(alternative, 0));
3092
21
        if (is_pearson) {
3093
12
          hv_stores(rhv, "parameter", newSVnv(df));
3094
12
          AV *restrict ci_av = newAV();
3095
12
          av_push(ci_av, newSVnv(ci_lower));
3096
12
          av_push(ci_av, newSVnv(ci_upper));
3097
12
          hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av));
3098        }
3099
3100
21
        RETVAL = newRV_noinc((SV*)rhv);
3101}
3102OUTPUT:
3103    RETVAL
3104
3105void
3106shapiro_test(data)
3107        SV *data
3108PREINIT:
3109        AV *restrict av;
3110        HV *restrict ret_hash;
3111
6
        size_t n_raw, n = 0;
3112
6
        double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0;
3113PPCODE:
3114
6
        if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) {
3115
0
          croak("Expected an array reference");
3116        }
3117
3118
6
        av = (AV *)SvRV(data);
3119
6
        n_raw = av_len(av) + 1;
3120
3121
6
        Newx(x, n_raw, double);
3122
3123        // Extract variables and calculate mean (skipping undefined/NaN values)
3124
78
        for (size_t i = 0; i < n_raw; i++) {
3125
72
          SV **restrict elem = av_fetch(av, i, 0);
3126
72
          if (elem && SvOK(*elem)) {
3127
72
                   double val = SvNV(*elem);
3128
72
                   if (!isnan(val)) {
3129
72
                       x[n] = val;
3130
72
                       mean += val;
3131
72
                       n++;
3132                   }
3133          }
3134        }
3135
3136
6
        if (n < 3 || n > 5000) {
3137
0
          Safefree(x);
3138
0
          croak("Sample size must be between 3 and 5000 (R's limit)");
3139        }
3140
3141
6
        mean /= n;
3142        // Calculate Sum of Squares */
3143
78
        for (size_t i = 0; i < n; i++) {
3144
72
          ssq += (x[i] - mean) * (x[i] - mean);
3145        }
3146
6
        if (ssq == 0.0) {
3147
0
          Safefree(x);
3148
0
          croak("Data is perfectly constant; cannot compute Shapiro-Wilk test");
3149        }
3150
6
        qsort(x, n, sizeof(double), compare_doubles);
3151
3152        // --- Core AS R94 Algorithm: Weights and Statistic W ---
3153
6
        if (n == 3) {
3154
0
          double a_val = 0.7071067811865475; /* sqrt(1/2) */
3155
0
          double b_val = a_val * (x[2] - x[0]);
3156
0
          w = (b_val * b_val) / ssq;
3157
0
          if (w < 0.75) w = 0.75;
3158          // Exact P-value for n=3
3159
0
          p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660);
3160        } else {
3161          double *restrict m, *restrict a;
3162
6
          double sum_m2 = 0.0, b_val = 0.0;
3163
6
          Newx(m, n, double);
3164
6
          Newx(a, n, double);
3165
78
          for (size_t i = 0; i < n; i++) {
3166
72
                   m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25));
3167
72
                   sum_m2 += m[i] * m[i];
3168          }
3169
6
          double u = 1.0 / sqrt((double)n);
3170
6
          double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2);
3171
6
          a[n-1] = a_n;
3172
6
          a[0]   = -a_n;
3173
9
          if (n == 4 || n == 5) {
3174
3
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n);
3175
12
                   for (unsigned int i = 1; i < n-1; i++) {
3176
9
                       a[i] = m[i] / sqrt(eps);
3177                   }
3178          } else {
3179
3
                   double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2);
3180
3
                   a[n-2] = a_n1;
3181
3
                   a[1]   = -a_n1;
3182
3
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1);
3183
48
                   for (unsigned int i = 2; i < n-2; i++) {
3184
45
                       a[i] = m[i] / sqrt(eps);
3185                   }
3186          }
3187
78
          for (size_t i = 0; i < n; i++) {
3188
72
                   b_val += a[i] * x[i];
3189          }
3190
6
          w = (b_val * b_val) / ssq;
3191        // --- AS R94 P-Value Calculation: High Precision Refinement ---
3192          /* NOTE: p_val is declared in PREINIT above;
3193                * do NOT shadow it with a local 'double p_val' here or the result will never reach the caller.
3194                */
3195
6
          double y = log(1.0 - w);
3196          double z;
3197
6
          if (n <= 11) {
3198                   // Royston's branch for 4 <= n <= 11 (AS R94, small-sample path).
3199                   // gamma is the upper bound on y = log(1-W);
3200                   // if y reaches gamma the p-value is essentially zero
3201
3
                   double nn = (double)n;
3202
3
                   double gamma = 0.459 * nn - 2.273;
3203
3
                   if (y >= gamma) {
3204
0
                       p_val = 1e-19;
3205                   } else {
3206                       // Horner-form polynomials in n for mu and log(sigma)
3207
3
                       double mu     = 0.544  + nn * (-0.39978  + nn * ( 0.025054  - nn * 0.0006714));
3208
3
                       double sig_val= 1.3822 + nn * (-0.77857  + nn * ( 0.062767  - nn * 0.0020322));
3209
3
                       double sigma  = exp(sig_val);
3210
3
                       z = (-log(gamma - y) - mu) / sigma;
3211                       /* Upper-tail probability P(Z > z): small W → large z → small p-value.
3212                       */
3213
3
                       p_val = 0.5 * erfc(z * M_SQRT1_2);
3214                   }
3215          } else {
3216                   // Royston's branch for n >= 12 (AS R94, large-sample path)
3217
3
                   double ln_n   = log((double)n);
3218                   // Horner-form polynomials in log(n) for mu and log(sigma). */
3219
3
                   double mu     = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915));
3220
3
                   double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302);
3221
3
                   double sigma  = exp(sig_val);
3222
3
                   z = (y - mu) / sigma;
3223
3
                   p_val = 0.5 * erfc(z * M_SQRT1_2);
3224          }
3225          // Clamp the p-value
3226
6
          if (p_val > 1.0) p_val = 1.0;
3227
6
          if (p_val < 0.0) p_val = 0.0;
3228
3229
6
          Safefree(m); m = NULL;  Safefree(a); a = NULL;
3230        }
3231
6
        Safefree(x); x = NULL;
3232
6
        ret_hash = newHV();
3233
6
        hv_stores(ret_hash, "statistic", newSVnv(w));
3234
6
        hv_stores(ret_hash, "W",         newSVnv(w));
3235
6
        hv_stores(ret_hash, "p_value",   newSVnv(p_val));
3236
6
        hv_stores(ret_hash, "p.value",   newSVnv(p_val));
3237
6
        EXTEND(SP, 1);
3238
6
        PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash)));
3239
3240double min(...)
3241        PROTOTYPE: @
3242        INIT:
3243
30
                double min_val = 0.0;
3244
30
                size_t count = 0;
3245
30
                bool first = TRUE;
3246        CODE:
3247
30102
                for (unsigned short int i = 0; i < items; i++) {
3248
30072
                        SV* restrict arg = ST(i);
3249
30081
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3250
9
                                AV* restrict av = (AV*)SvRV(arg);
3251
9
                                size_t len = av_len(av) + 1;
3252
642
                                for (size_t j = 0; j < len; j++) {
3253
633
                                  SV** restrict tv = av_fetch(av, j, 0);
3254
633
                                  if (tv && SvOK(*tv)) {
3255
633
                                                double val = SvNV(*tv);
3256
633
                                                if (first || val < min_val) {
3257
9
                                                         min_val = val;
3258
9
                                                         first = FALSE;
3259                                                }
3260
633
                                                count++;
3261                                  }
3262                                }
3263
30063
                        } else if (SvOK(arg)) {
3264
30063
                                double val = SvNV(arg);
3265
30063
                                if (first || val < min_val) {
3266
48
                                  min_val = val;
3267
48
                                  first = FALSE;
3268                                }
3269
30063
                                count++;
3270                        }
3271                }
3272
30
                if (count == 0) croak("min needs >= 1 numeric element");
3273
27
                RETVAL = min_val;
3274        OUTPUT:
3275                RETVAL
3276
3277double max(...)
3278        PROTOTYPE: @
3279        INIT:
3280
33
          double max_val = 0.0;
3281
33
          size_t count = 0;
3282
33
          bool first = TRUE;
3283        CODE:
3284
30105
          for (size_t i = 0; i < items; i++) {
3285
30072
                   SV* restrict arg = ST(i);
3286
30084
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3287
12
                       AV* restrict av = (AV*)SvRV(arg);
3288
12
                       size_t len = av_len(av) + 1;
3289
945
                       for (size_t j = 0; j < len; j++) {
3290
933
                           SV** restrict tv = av_fetch(av, j, 0);
3291
933
                           if (tv && SvOK(*tv)) {
3292
933
                               double val = SvNV(*tv);
3293
933
                               if (first || val > max_val) {
3294
42
                                   max_val = val;
3295
42
                                   first = FALSE;
3296                               }
3297
933
                               count++;
3298                           }
3299                       }
3300
30060
                   } else if (SvOK(arg)) {
3301
30060
                       double val = SvNV(arg);
3302
30060
                       if (first || val > max_val) {
3303
89
                           max_val = val;
3304
89
                           first = FALSE;
3305                       }
3306
30060
                       count++;
3307                   }
3308          }
3309
33
          if (count == 0) croak("max needs >= 1 numeric element");
3310
30
          RETVAL = max_val;
3311        OUTPUT:
3312          RETVAL
3313
3314SV* runif(...)
3315CODE:
3316{
3317
12
        size_t n = 0;
3318
12
        double min = 0.0, max = 1.0;
3319
3320        // Flags to track what has been assigned
3321
12
        bool n_set = 0, min_set = 0, max_set = 0;
3322
3323
12
        unsigned int i = 0;
3324
3325
12
        if (items == 0) {
3326
0
          croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)");
3327        }
3328
3329
42
        while (i < items) {
3330                // 1. Check if the current argument is a string key for a named parameter
3331
30
                if (i + 1 < items && SvPOK(ST(i))) {
3332
18
                        char *restrict key = SvPV_nolen(ST(i));
3333
18
                        if (strEQ(key, "n")) {
3334
6
                                n = (size_t)SvUV(ST(i+1));
3335
6
                                n_set = 1;
3336
6
                                i += 2;
3337
6
                                continue;
3338
12
                        } else if (strEQ(key, "min")) {
3339
6
                                min = SvNV(ST(i+1));
3340
6
                                min_set = 1;
3341
6
                                i += 2;
3342
6
                                continue;
3343
6
                        } else if (strEQ(key, "max")) {
3344
6
                                max = SvNV(ST(i+1));
3345
6
                                max_set = 1;
3346
6
                                i += 2;
3347
6
                                continue;
3348                        }
3349                }
3350
3351                // 2. Fallback to positional parsing if it's not a recognized key
3352
12
                if (!n_set) {
3353
6
                        n = (size_t)SvUV(ST(i));
3354
6
                        n_set = 1;
3355
6
                } else if (!min_set) {
3356
3
                        min = SvNV(ST(i));
3357
3
                        min_set = 1;
3358
3
                } else if (!max_set) {
3359
3
                        max = SvNV(ST(i));
3360
3
                        max_set = 1;
3361                } else {
3362
0
                        croak("Too many arguments or unrecognized parameter passed to runif()");
3363                }
3364
12
                i++;
3365        }
3366
12
        if (!n_set) {
3367
0
                croak("runif() requires at least the 'n' parameter");
3368        }
3369        // Ensure PRNG is seeded
3370
12
        AUTO_SEED_PRNG();
3371
12
        AV *restrict results = newAV();
3372
12
        if (n > 0) {
3373
12
                av_extend(results, n - 1);
3374        }
3375
12
        const double range = max - min;
3376
60060
        for (size_t j = 0; j < n; j++) {
3377                double r;
3378
60048
                if (max < min) {
3379
0
                        r = NAN; // R behavior for inverted ranges
3380                } else {
3381
60048
                        r = min + range * Drand01();
3382                }
3383
60048
                av_push(results, newSVnv(r));
3384        }
3385
12
        RETVAL = newRV_noinc((SV*)results);
3386}
3387OUTPUT:
3388    RETVAL
3389
3390SV* rbinom(...)
3391        CODE:
3392        {
3393          // Auto-seed the PRNG if the Perl script hasn't done so yet
3394
36
          AUTO_SEED_PRNG();
3395
36
          if (items % 2 != 0)
3396
3
                   croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)");
3397          //Parse named arguments
3398
33
          size_t n = 0, size = 0;
3399
33
          double prob = 0.5;
3400
3401
33
          bool size_set = false, prob_set = false;
3402
3403
126
          for (unsigned short i = 0; i < items; i += 2) {
3404
93
                   const char* restrict key = SvPV_nolen(ST(i));
3405
93
                   SV* restrict val = ST(i + 1);
3406
3407
93
                   if      (strEQ(key, "n"))      n    = (unsigned int)SvUV(val);
3408
60
                   else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = true; }
3409
30
                   else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = true; }
3410
0
                   else croak("rbinom: unknown argument '%s'", key);
3411          }
3412
3413          // R requires size and prob to be explicitly passed in rbinom
3414
33
          if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments");
3415
27
          if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1");
3416
3417
21
          AV *restrict result_av = newAV();
3418
21
          if (n > 0) {
3419
21
                   av_extend(result_av, n - 1);
3420
61518
                   for (unsigned int i = 0; i < n; i++) {
3421
61497
                       av_store(result_av, i, newSVuv(generate_binomial(size, prob)));
3422                   }
3423          }
3424
3425
21
          RETVAL = newRV_noinc((SV*)result_av);
3426        }
3427        OUTPUT:
3428          RETVAL
3429
3430SV*
3431hist(SV* x_sv, ...)
3432        CODE:
3433        {
3434                // 1. Validate Input
3435
27
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3436
6
                        croak("hist: first argument must be an array reference");
3437
3438
21
                AV*restrict x_av = (AV*)SvRV(x_sv);
3439
21
                size_t n_raw = av_len(x_av) + 1;
3440
21
                if (n_raw == 0) croak("hist: input array is empty");
3441
3442                // 2. Extract Data & Find Range
3443                double *restrict x;
3444
18
                Newx(x, n_raw, double);
3445
18
                size_t n = 0;
3446
18
                double min_val = DBL_MAX, max_val = -DBL_MAX;
3447
3448
6078
                for (size_t i = 0; i < n_raw; i++) {
3449
6063
                        SV**restrict tv = av_fetch(x_av, i, 0);
3450
6063
                        if (tv && SvOK(*tv)) {
3451
6063
                                 double val = SvNV(*tv);
3452
6060
                                 x[n++] = val;
3453
6060
                                 if (val < min_val) min_val = val;
3454
6060
                                 if (val > max_val) max_val = val;
3455                        }
3456                }
3457
15
                if (n == 0) {
3458
0
                        Safefree(x);
3459
0
                        croak("hist: input contains no valid numeric data");
3460                }
3461                // 3. Determine Bin Count (Sturges default or user-provided)
3462
15
                size_t n_bins = 0;
3463
3464
15
                if (items == 2) {
3465                        // Support pure positional argument: hist($data, 22)
3466
0
                        n_bins = (size_t)SvIV(ST(1));
3467
15
                } else if (items > 2) {
3468                        /* Support named parameters even if mixed with positional arguments */
3469
15
                        for (unsigned short i = 1; i < items - 1; i++) {
3470                                 /* Make sure the SV holds a string before doing string comparison */
3471
15
                                 if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) {
3472
15
                                     n_bins = (size_t)SvIV(ST(i+1));
3473
15
                                     break;
3474                                 }
3475                        }
3476                        /* Fallback: if 'breaks' wasn't found but a positional number was given first */
3477
15
                        if (n_bins == 0 && looks_like_number(ST(1))) {
3478
0
                                 n_bins = (size_t)SvIV(ST(1));
3479                        }
3480                }
3481
15
                if (n_bins == 0) n_bins = calculate_sturges_bins(n);
3482                // 4. Allocate Result Arrays
3483                double *restrict breaks, *restrict mids, *restrict density;
3484                size_t *restrict counts;
3485
15
                Newx(breaks,  n_bins + 1, double);
3486
15
                Newx(mids,    n_bins,     double);
3487
15
                Newx(density, n_bins,     double);
3488
15
                Newx(counts,  n_bins,     size_t);
3489
3490                // Generate simple linear breaks
3491
15
                double step = (max_val - min_val) / (double)n_bins;
3492
84
                for (size_t i = 0; i <= n_bins; i++) {
3493
69
                        breaks[i] = min_val + (double)i * step;
3494                }
3495
3496                // 5. Compute Statistics
3497
15
                compute_hist_logic(x, n, breaks, n_bins, counts, mids, density);
3498
3499                // 6. Build Return HashRef
3500
15
                HV*restrict res_hv = newHV();
3501
15
                AV*restrict av_breaks  = newAV();
3502
15
                AV*restrict av_counts  = newAV();
3503
15
                AV*restrict av_mids    = newAV();
3504
15
                AV*restrict av_density = newAV();
3505
84
                for (size_t i = 0; i <= n_bins; i++) {
3506
69
                        av_push(av_breaks, newSVnv(breaks[i]));
3507
69
                        if (i < n_bins) {
3508
54
                                 av_push(av_counts,  newSViv(counts[i]));
3509
54
                                 av_push(av_mids,    newSVnv(mids[i]));
3510
54
                                 av_push(av_density, newSVnv(density[i]));
3511                        }
3512                }
3513
15
                hv_stores(res_hv, "breaks",  newRV_noinc((SV*)av_breaks));
3514
15
                hv_stores(res_hv, "counts",  newRV_noinc((SV*)av_counts));
3515
15
                hv_stores(res_hv, "mids",    newRV_noinc((SV*)av_mids));
3516
15
                hv_stores(res_hv, "density", newRV_noinc((SV*)av_density));
3517
3518                // Clean
3519
15
                Safefree(x); Safefree(breaks); Safefree(mids);
3520
15
                Safefree(density); Safefree(counts);
3521
3522
15
                RETVAL = newRV_noinc((SV*)res_hv);
3523        }
3524        OUTPUT:
3525          RETVAL
3526
3527SV* quantile(...)
3528        CODE:
3529        {
3530
12
                SV *restrict x_sv = NULL;
3531
12
                SV *restrict probs_sv = NULL;
3532
12
                int arg_idx = 0;
3533
3534                /* --- 1. Consume first positional arg as 'x' if it's an array ref --- */
3535
12
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3536
9
                         x_sv = ST(arg_idx);
3537
9
                         arg_idx++;
3538                }
3539
3540                /* --- 2. Remaining args must be key-value pairs --- */
3541
12
                if ((items - arg_idx) % 2 != 0)
3542
0
                         croak("Usage: quantile(\\@data, probs => \\@probs)  OR  quantile(x => \\@data, probs => \\@probs)");
3543
3544
27
                for (; arg_idx < items; arg_idx += 2) {
3545
15
                         const char *restrict key = SvPV_nolen(ST(arg_idx));
3546
15
                         SV *restrict val = ST(arg_idx + 1);
3547
3548
15
                         if      (strEQ(key, "x"))     x_sv     = val;
3549
12
                         else if (strEQ(key, "probs")) probs_sv = val;
3550
0
                         else croak("quantile: unknown argument '%s'", key);
3551                }
3552
12
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3553
0
                        croak("quantile: 'x' must be an array reference");
3554
12
                AV *restrict x_av = (AV*)SvRV(x_sv);
3555
12
                size_t n_raw = av_len(x_av) + 1;
3556
12
                if (n_raw == 0) croak("quantile: 'x' is empty");
3557
3558                /* --- Extract valid numeric data & drop NAs --- */
3559                double *restrict x;
3560
12
                Newx(x, n_raw, double);
3561
12
                size_t n = 0;
3562
624
                for (size_t i = 0; i < n_raw; i++) {
3563
612
                        SV **restrict tv = av_fetch(x_av, i, 0);
3564
612
                        if (tv && SvOK(*tv)) {
3565
612
                                 x[n++] = SvNV(*tv);
3566                        }
3567                }
3568
12
                if (n == 0) {
3569
0
                        Safefree(x);
3570
0
                        croak("quantile: 'x' contains no valid numbers");
3571                }
3572                // --- Sort Data for Quantile Math ---
3573
12
                qsort(x, n, sizeof(double), compare_doubles);
3574                // --- Parse Probabilities (Default matches R's c(0, .25, .5, .75, 1)) ---
3575
12
                double default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0};
3576
12
                unsigned int n_probs = 5;
3577                double *restrict probs;
3578
3579
24
                if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) {
3580
12
                        AV *restrict p_av = (AV*)SvRV(probs_sv);
3581
12
                        n_probs = av_len(p_av) + 1;
3582
12
                        Newx(probs, n_probs, double);
3583
39
                        for (unsigned int i = 0; i < n_probs; i++) {
3584
27
                                 SV **tv = av_fetch(p_av, i, 0);
3585
27
                                 probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0;
3586
27
                                 if (probs[i] < 0.0 || probs[i] > 1.0) {
3587
0
                                     Safefree(x); Safefree(probs);
3588
0
                                     croak("quantile: probabilities must be between 0 and 1");
3589                                 }
3590                        }
3591                } else {
3592
0
                        Newx(probs, n_probs, double);
3593
0
                        for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i];
3594                }
3595
3596                /* --- Calculate Quantiles (R Type 7 Algorithm) --- */
3597
12
                HV *restrict res_hv = newHV();
3598
3599
39
                for (size_t i = 0; i < n_probs; i++) {
3600
27
                        double p = probs[i], q = 0.0;
3601
3602
27
                        if (n == 1) {
3603
3
                                 q = x[0];
3604
24
                        } else if (p == 1.0) {
3605
3
                                 q = x[n - 1]; /* Prevent out-of-bounds mapping */
3606
21
                        } else if (p == 0.0) {
3607
3
                                 q = x[0];
3608                        } else {
3609                                 /* Continuous sample quantile interpolation (Type 7) */
3610
18
                                 double h = (n - 1) * p;
3611
18
                                 unsigned int j = (unsigned int)h; /* floor via cast */
3612
18
                                 double gamma = h - j;
3613
18
                                 q = (1.0 - gamma) * x[j] + gamma * x[j + 1];
3614                        }
3615
3616                        /* Format hash key to exactly match R's naming convention ("25%", "33.3%") */
3617                        char key[32];
3618
27
                        double pct = p * 100.0;
3619
3620
27
                        if (pct == (unsigned int)pct) {
3621
27
                                 snprintf(key, sizeof(key), "%.0f%%", pct);
3622                        } else {
3623
0
                                 snprintf(key, sizeof(key), "%.1f%%", pct);
3624                        }
3625
3626
27
                        hv_store(res_hv, key, strlen(key), newSVnv(q), 0);
3627                }
3628
3629
12
                Safefree(x);
3630
12
                Safefree(probs);
3631
3632
12
                RETVAL = newRV_noinc((SV*)res_hv);
3633        }
3634        OUTPUT:
3635          RETVAL
3636
3637
3638double mean(...)
3639        PROTOTYPE: @
3640        INIT:
3641
117
          double total = 0;
3642
117
          size_t count = 0;
3643        CODE:
3644
267
          for (size_t i = 0; i < items; i++) {
3645
150
                   SV*restrict arg = ST(i);
3646
261
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3647
111
                       AV*restrict av = (AV*)SvRV(arg);
3648
111
                       size_t len = av_len(av) + 1;
3649
60915
                       for (size_t j = 0; j < len; j++) {
3650
60804
                           SV**restrict tv = av_fetch(av, j, 0);
3651
60804
                           if (tv && SvOK(*tv)) { total += SvNV(*tv); count++; }
3652                       }
3653
39
                   } else if (SvOK(arg)) {
3654
39
                       total += SvNV(arg); count++;
3655                   }
3656          }
3657
117
          if (count == 0) croak("mean needs >= 1 element");
3658
114
          RETVAL = total / count;
3659        OUTPUT:
3660          RETVAL
3661
3662double sd(...)
3663        PROTOTYPE: @
3664        INIT:
3665
15
          double mean = 0.0, M2 = 0.0;
3666
15
          size_t count = 0;
3667        CODE:
3668          // Single Pass Standard Deviation via Welford's Algorithm
3669
66
          for (size_t i = 0; i < items; i++) {
3670
51
                   SV* restrict arg = ST(i);
3671
54
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3672
3
                       AV*restrict av = (AV*)SvRV(arg);
3673
3
                       size_t len = av_len(av) + 1;
3674
30000
                       for (size_t j = 0; j < len; j++) {
3675
29997
                           SV**restrict tv = av_fetch(av, j, 0);
3676
29997
                           if (tv && SvOK(*tv)) {
3677
29997
                               count++;
3678
29997
                               double val = SvNV(*tv);
3679
29997
                               double delta = val - mean;
3680
29997
                               mean += delta / count;
3681
29997
                               M2 += delta * (val - mean);
3682                           }
3683                       }
3684
48
                   } else if (SvOK(arg)) {
3685
48
                       count++;
3686
48
                       double val = SvNV(arg);
3687
48
                       double delta = val - mean;
3688
48
                       mean += delta / count;
3689
48
                       M2 += delta * (val - mean);
3690                   }
3691          }
3692
15
          if (count < 2) croak("stdev needs >= 2 elements");
3693
12
          RETVAL = sqrt(M2 / (count - 1));
3694        OUTPUT:
3695          RETVAL
3696
3697double var(...)
3698        PROTOTYPE: @
3699        INIT:
3700
18
          double mean = 0.0, M2 = 0.0;
3701
18
          size_t count = 0;
3702        CODE:
3703          // Single Pass Variance via Welford's Algorithm
3704
51
          for (size_t i = 0; i < items; i++) {
3705
33
                   SV*restrict arg = ST(i);
3706
42
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3707
9
                       AV*restrict av = (AV*)SvRV(arg);
3708
9
                       size_t len = av_len(av) + 1;
3709
30039
                       for (size_t j = 0; j < len; j++) {
3710
30030
                           SV**restrict tv = av_fetch(av, j, 0);
3711
30030
                           if (tv && SvOK(*tv)) {
3712
30030
                               count++;
3713
30030
                               double val = SvNV(*tv);
3714
30030
                               double delta = val - mean;
3715
30030
                               mean += delta / count;
3716
30030
                               M2 += delta * (val - mean);
3717                           }
3718                       }
3719
24
                   } else if (SvOK(arg)) {
3720
24
                       count++;
3721
24
                       double val = SvNV(arg);
3722
24
                       double delta = val - mean;
3723
24
                       mean += delta / count;
3724
24
                       M2 += delta * (val - mean);
3725                   }
3726          }
3727
18
          if (count < 2) croak("var needs >= 2 elements");
3728
15
          RETVAL = M2 / (count - 1);
3729        OUTPUT:
3730          RETVAL
3731
3732SV* t_test(...)
3733        CODE:
3734        {
3735
141
                SV*restrict x_sv = NULL;
3736
141
                SV*restrict y_sv = NULL;
3737
141
                double mu = 0.0, conf_level = 0.95;
3738
141
                bool paired = FALSE, var_equal = FALSE;
3739
141
                const char*restrict alternative = "two.sided";
3740
3741
141
                int arg_idx = 0;
3742
3743                // 1. Shift first positional argument as 'x' if it's an array reference
3744
141
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3745
63
                  x_sv = ST(arg_idx);
3746
63
                  arg_idx++;
3747                }
3748
3749                // 2. Shift second positional argument as 'y' if it's an array reference
3750
141
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3751
12
                  y_sv = ST(arg_idx);
3752
12
                  arg_idx++;
3753                }
3754
3755                // Ensure the remaining arguments form complete key-value pairs
3756
141
                if ((items - arg_idx) % 2 != 0) {
3757
0
                  croak("Usage: t_test(\\@x, [\\@y], key => value, ...)");
3758                }
3759
3760                // --- Parse named arguments from the remaining flat stack ---
3761
369
                for (; arg_idx < items; arg_idx += 2) {
3762
228
                        const char*restrict key = SvPV_nolen(ST(arg_idx));
3763
228
                        SV*restrict val = ST(arg_idx + 1);
3764
3765
228
                        if      (strEQ(key, "x"))           x_sv        = val;
3766
153
                        else if (strEQ(key, "y"))           y_sv        = val;
3767
138
                        else if (strEQ(key, "mu"))          mu          = SvNV(val);
3768
33
                        else if (strEQ(key, "paired"))      paired      = SvTRUE(val);
3769
21
                        else if (strEQ(key, "var_equal"))   var_equal   = SvTRUE(val);
3770
12
                        else if (strEQ(key, "conf_level"))  conf_level  = SvNV(val);
3771
6
                        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
3772
0
                        else croak("t_test: unknown argument '%s'", key);
3773                }
3774
3775                // --- Validate required / types ---
3776
141
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3777
3
                        croak("t_test: 'x' is a required argument and must be an ARRAY reference");
3778
138
                AV*restrict x_av = (AV*)SvRV(x_sv);
3779
138
                size_t nx = av_len(x_av) + 1;
3780
138
                if (nx < 2) croak("t_test: 'x' needs at least 2 elements");
3781
138
                AV*restrict y_av = NULL;
3782
138
                if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV)
3783
24
                        y_av = (AV*)SvRV(y_sv);
3784
3785
138
                if (conf_level <= 0.0 || conf_level >= 1.0)
3786
3
                        croak("t_test: 'conf_level' must be between 0 and 1");
3787                // --- Computation via Welford's Algorithm --- */
3788
135
                double mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est;
3789
135
                HV*restrict results = newHV();
3790
1233
                for (size_t i = 0; i < nx; i++) {
3791
1098
                        SV**restrict tv = av_fetch(x_av, i, 0);
3792
1098
                        double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3793
1098
                        double delta = val - mean_x;
3794
1098
                        mean_x += delta / (i + 1);
3795
1098
                        M2_x += delta * (val - mean_x);
3796                }
3797
135
                var_x = M2_x / (nx - 1);
3798
135
                if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant");
3799
3800
153
                if (paired || y_av) {
3801
27
                        if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests");
3802
24
                        size_t ny = av_len(y_av) + 1;
3803
24
                        if (paired && ny != nx) croak("t_test: Paired arrays must be same length");
3804
21
                        double mean_y = 0.0, M2_y = 0.0, var_y;
3805
312
                        for (size_t i = 0; i < ny; i++) {
3806
291
                                 SV**restrict tv = av_fetch(y_av, i, 0);
3807
291
                                 double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3808
291
                                 double delta = val - mean_y;
3809
291
                                 mean_y += delta / (i + 1);
3810
291
                                 M2_y += delta * (val - mean_y);
3811                        }
3812
21
                        var_y = M2_y / (ny - 1);
3813
21
                        if (paired) {
3814
6
                                 double mean_d = 0.0, M2_d = 0.0;
3815
42
                                 for (size_t i = 0; i < nx; i++) {
3816
36
                                          SV**restrict dx_ptr = av_fetch(x_av, i, 0);
3817
36
                                          SV**restrict dy_ptr = av_fetch(y_av, i, 0);
3818
36
                                     double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0;
3819
36
                                     double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0;
3820
36
                                     double val = dx - dy;
3821
36
                                     double delta = val - mean_d;
3822
36
                                     mean_d += delta / (i + 1);
3823
36
                                     M2_d += delta * (val - mean_d);
3824                                 }
3825
6
                                 double var_d = M2_d / (nx - 1);
3826
6
                                 if (var_d == 0.0) croak("t_test: data are essentially constant");
3827
6
                                 cint_est = mean_d;
3828
6
                                 std_err  = sqrt(var_d / nx);
3829
6
                                 t_stat   = (cint_est - mu) / std_err;
3830
6
                                 df       = nx - 1;
3831
6
                                 hv_store(results, "estimate", 8, newSVnv(mean_d), 0);
3832
15
                        } else if (var_equal) {
3833
6
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3834
6
                                 double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2);
3835
6
                                 cint_est = mean_x - mean_y;
3836
6
                                 std_err  = sqrt(pooled_var * (1.0 / nx + 1.0 / ny));
3837
6
                                 t_stat   = (cint_est - mu) / std_err;
3838
6
                                 df       = nx + ny - 2;
3839
6
                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3840
6
                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3841                        } else {
3842
9
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3843
9
                                 cint_est         = mean_x - mean_y;
3844
9
                                 double stderr_x2 = var_x / nx;
3845
9
                                 double stderr_y2 = var_y / ny;
3846
9
                                 std_err          = sqrt(stderr_x2 + stderr_y2);
3847
9
                                 t_stat           = (cint_est - mu) / std_err;
3848
9
                                 df = pow(stderr_x2 + stderr_y2, 2) /
3849
9
                                      (pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1));
3850
9
                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3851
9
                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3852                        }
3853                } else {
3854
105
                        cint_est = mean_x;
3855
105
                        std_err  = sqrt(var_x / nx);
3856
105
                        t_stat   = (cint_est - mu) / std_err;
3857
105
                        df       = nx - 1;
3858
105
                        hv_store(results, "estimate", 8, newSVnv(mean_x), 0);
3859                }
3860
126
                p_val = get_t_pvalue(t_stat, df, alternative);
3861
126
                double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper;
3862
126
                if (strcmp(alternative, "less") == 0) {
3863
3
                        t_crit   = qt_tail(df, alpha);
3864
3
                        ci_lower = -INFINITY;
3865
3
                        ci_upper = cint_est + t_crit * std_err;
3866
123
                } else if (strcmp(alternative, "greater") == 0) {
3867
3
                        t_crit   = qt_tail(df, alpha);
3868
3
                        ci_lower = cint_est - t_crit * std_err;
3869
3
                        ci_upper = INFINITY;
3870                } else {
3871
120
                        t_crit   = qt_tail(df, alpha / 2.0);
3872
120
                        ci_lower = cint_est - t_crit * std_err;
3873
120
                        ci_upper = cint_est + t_crit * std_err;
3874                }
3875
126
                AV*restrict conf_int = newAV();
3876
126
                av_push(conf_int, newSVnv(ci_lower));
3877
126
                av_push(conf_int, newSVnv(ci_upper));
3878
126
                hv_store(results, "statistic", 9, newSVnv(t_stat), 0);
3879
126
                hv_store(results, "df",        2, newSVnv(df),     0);
3880
126
                hv_store(results, "p_value",   7, newSVnv(p_val),  0);
3881
126
                hv_store(results, "conf_int",  8, newRV_noinc((SV*)conf_int), 0);
3882
126
                RETVAL = newRV_noinc((SV*)results);
3883        }
3884        OUTPUT:
3885          RETVAL
3886
3887void p_adjust(SV* p_sv, const char* method = "holm")
3888        INIT:
3889
45
                if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) {
3890
3
                        croak("p_adjust: first argument must be an ARRAY reference of p-values");
3891                }
3892
42
                AV *restrict p_av = (AV*)SvRV(p_sv);
3893
42
                size_t n = av_len(p_av) + 1;
3894                // Handle empty input
3895
42
                if (n == 0) {
3896
3
                        XSRETURN_EMPTY;
3897                }
3898                // Normalize method string
3899                char meth[64];
3900
39
                strncpy(meth, method, 63); meth[63] = '\0';
3901
471
                for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]);
3902                // Resolve aliases
3903
39
                if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh");
3904
39
                if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by");
3905
39
                if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh");
3906                // Allocate C memory
3907                PVal *restrict arr;
3908                double *restrict adj;
3909
39
                Newx(arr, n, PVal);
3910
39
                Newx(adj, n, double);
3911
3912
1107
                for (size_t i = 0; i < n; i++) {
3913
1068
                        SV**restrict tv = av_fetch(p_av, i, 0);
3914
1068
                        arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0;
3915
1068
                        arr[i].orig_idx = i;
3916                }
3917                // Sort ascending (Stable sort using original index)
3918
39
                qsort(arr, n, sizeof(PVal), cmp_pval);
3919        PPCODE:
3920
39
                if (strcmp(meth, "bonferroni") == 0) {
3921
159
                        for (size_t i = 0; i < n; i++) {
3922
153
                                double v = arr[i].p * n;
3923
153
                                adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0;
3924                        }
3925
33
                } else if (strcmp(meth, "holm") == 0) {
3926
6
                        double cummax = 0.0;
3927
159
                        for (size_t i = 0; i < n; i++) {
3928
153
                                 double v = arr[i].p * (n - i);
3929
153
                                 if (v > cummax) cummax = v;
3930
153
                                 adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0;
3931                        }
3932
27
                } else if (strcmp(meth, "hochberg") == 0) {
3933
6
                        double cummin = 1.0;
3934
159
                        for (ssize_t i = n - 1; i >= 0; i--) {
3935
153
                                 double v = arr[i].p * (n - i);
3936
153
                                 if (v < cummin) cummin = v;
3937
153
                                 adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
3938                        }
3939
21
                } else if (strcmp(meth, "bh") == 0) {
3940
6
                        double cummin = 1.0;
3941
159
                        for (ssize_t i = n - 1; i >= 0; i--) {
3942
153
                                double v = arr[i].p * n / (i + 1.0);
3943
153
                                if (v < cummin) cummin = v;
3944
153
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
3945                        }
3946
15
                } else if (strcmp(meth, "by") == 0) {
3947
6
                        double q = 0.0;
3948
159
                        for (size_t i = 1; i <= n; i++) q += 1.0 / i;
3949
6
                        double cummin = 1.0;
3950
159
                        for (ssize_t i = n - 1; i >= 0; i--) {
3951
153
                                double v = arr[i].p * n / (i + 1.0) * q;
3952
153
                                if (v < cummin) cummin = v;
3953
153
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
3954                        }
3955
9
                } else if (strcmp(meth, "hommel") == 0) {
3956                        double *restrict pa, *restrict q_arr;
3957
6
                        Newx(pa, n, double);
3958
6
                        Newx(q_arr, n, double);
3959                        // Initial: min(n * p[i] / (i + 1))
3960
6
                        double min_val = n * arr[0].p;
3961
153
                        for (size_t i = 1; i < n; i++) {
3962
147
                                double temp = (n * arr[i].p) / (i + 1.0);
3963
147
                                if (temp < min_val) {
3964
0
                                   min_val = temp;
3965                                }
3966                        }
3967                        // pa <- q <- rep(min, n)
3968
159
                        for (size_t i = 0; i < n; i++) {
3969
153
                                 pa[i] = min_val;
3970
153
                                 q_arr[i] = min_val;
3971                        }
3972
150
                        for (size_t j = n - 1; j >= 2; j--) {
3973
144
                                 ssize_t n_mj = n - j;       // Max index for 'ij'. Length is n_mj + 1
3974
144
                                 ssize_t i2_len = j - 1;     // Length of 'i2
3975                                 // Calculate q1 = min(j * p[i2] / (2:j))
3976
144
                                 double q1 = (j * arr[n_mj + 1].p) / 2.0;
3977
3528
                                 for (size_t k = 1; k < i2_len; k++) {
3978
3384
                                     double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k);
3979
3384
                                     if (temp_q1 < q1) {
3980
798
                                         q1 = temp_q1;
3981                                     }
3982                                 }
3983                                 // q[ij] <- pmin(j * p[ij], q1)
3984
3816
                                 for (size_t i = 0; i <= n_mj; i++) {
3985
3672
                                     double v = j * arr[i].p;
3986
3672
                                     q_arr[i] = (v < q1) ? v : q1;
3987                                 }
3988                                 // q[i2] <- q[n - j]
3989
3672
                                 for (size_t i = 0; i < i2_len; i++) {
3990
3528
                                     q_arr[n_mj + 1 + i] = q_arr[n_mj];
3991                                }
3992                                 // pa <- pmax(pa, q)
3993
7344
                                for (size_t i = 0; i < n; i++) {
3994
7200
                                    if (pa[i] < q_arr[i]) {
3995
4203
                                       pa[i] = q_arr[i];
3996                                    }
3997                                }
3998                        }
3999                        // pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices
4000
159
                        for (size_t i = 0; i < n; i++) {
4001
153
                                double v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p;
4002
153
                                if (v > 1.0) v = 1.0;
4003
153
                                adj[arr[i].orig_idx] = v;
4004                        }
4005
6
                        Safefree(pa);  Safefree(q_arr);
4006
3
                } else if (strcmp(meth, "none") == 0) {
4007
0
                        for (size_t i = 0; i < n; i++) {
4008
0
                                adj[arr[i].orig_idx] = arr[i].p;
4009                        }
4010                } else {
4011
3
                        Safefree(arr); Safefree(adj);
4012
3
                        croak("Unknown p-value adjustment method: %s", method);
4013                }
4014                // Push values onto the Perl stack as a flat list
4015
36
                EXTEND(SP, n);
4016
954
                for (size_t i = 0; i < n; i++) {
4017
918
                        PUSHs(sv_2mortal(newSVnv(adj[i])));
4018                }
4019
36
                Safefree(arr); arr = NULL;
4020
36
                Safefree(adj); adj = NULL;
4021
4022double median(...)
4023        PROTOTYPE: @
4024        INIT:
4025
18
                size_t total_count = 0, k = 0;
4026                double *restrict nums;
4027
18
                double median_val = 0.0;
4028        CODE:
4029          // Pass 1: Count total valid elements to allocate memory
4030
42
          for (size_t i = 0; i < items; i++) {
4031
24
                   SV*restrict arg = ST(i);
4032
36
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4033
12
                       AV*restrict av = (AV*)SvRV(arg);
4034
12
                       size_t len = av_len(av) + 1;
4035
129
                       for (size_t j = 0; j < len; j++) {
4036
117
                           SV**restrict tv = av_fetch(av, j, 0);
4037
117
                           if (tv && SvOK(*tv)) { total_count++; }
4038                       }
4039
12
                   } else if (SvOK(arg)) {
4040
12
                       total_count++;
4041                   }
4042          }
4043
18
          if (total_count == 0) croak("median needs >= 1 element");
4044          // Allocate C array now that we know the exact size
4045
15
          Newx(nums, total_count, double);
4046          // Pass 2: Populate the C array
4047
39
          for (size_t i = 0; i < items; i++) {
4048
24
                   SV*restrict arg = ST(i);
4049
36
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4050
12
                       AV*restrict av = (AV*)SvRV(arg);
4051
12
                       size_t len = av_len(av) + 1;
4052
129
                       for (size_t j = 0; j < len; j++) {
4053
117
                           SV**restrict tv = av_fetch(av, j, 0);
4054
117
                           if (tv && SvOK(*tv)) {
4055
117
                               nums[k++] = SvNV(*tv);
4056                           }
4057                       }
4058
12
                   } else if (SvOK(arg)) {
4059
12
                       nums[k++] = SvNV(arg);
4060                   }
4061          }
4062          // Sort and calculate median
4063
15
          qsort(nums, total_count, sizeof(double), compare_doubles);
4064
15
          if (total_count % 2 == 0) {
4065
12
                   median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0;
4066          } else {
4067
3
                   median_val = nums[total_count / 2];
4068          }
4069
15
          Safefree(nums); nums = NULL;
4070
15
          RETVAL = median_val;
4071        OUTPUT:
4072          RETVAL
4073
4074SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson")
4075        INIT:
4076          // --- validate method -------------------------------------------
4077
18
          if (strcmp(method, "pearson")  != 0 &&
4078
9
                   strcmp(method, "spearman") != 0 &&
4079
6
                   strcmp(method, "kendall")  != 0)
4080
3
                   croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')",
4081                         method);
4082
4083          // --- validate x ------------------------------------------------
4084
15
          if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4085
0
                   croak("cor: x must be an ARRAY reference");
4086
4087
15
          AV*restrict x_av = (AV*)SvRV(x_sv);
4088
15
          size_t nx   = av_len(x_av) + 1;
4089
15
          if (nx == 0) croak("cor: x is empty");
4090
4091          // --- detect whether x is a flat vector or a matrix (AoA) -------
4092
15
          bool x_is_matrix = 0;
4093          {
4094
15
                   SV**restrict fp = av_fetch(x_av, 0, 0);
4095
15
                   if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4096
3
                       x_is_matrix = 1;
4097          }
4098
4099          // --- detect y ----------------------------
4100
30
          bool has_y = (SvOK(y_sv) && SvROK(y_sv) &&
4101
15
                            SvTYPE(SvRV(y_sv)) == SVt_PVAV);
4102
4103
15
          AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL;
4104
15
          size_t ny = has_y ? av_len(y_av) + 1 : 0;
4105
4106
15
          bool y_is_matrix = 0;
4107
15
          if (has_y && ny > 0) {
4108
15
                   SV**restrict fp = av_fetch(y_av, 0, 0);
4109
15
                   if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4110
3
                       y_is_matrix = 1;
4111          }
4112
4113        CODE:
4114      // Branch 1: both inputs are flat vectors  â†’  scalar result
4115
15
          if (!x_is_matrix && !y_is_matrix) {
4116
12
                   if (!has_y) {
4117                       /* cor(vector) == 1 by definition */
4118
0
                       RETVAL = newSVnv(1.0);
4119                   } else {
4120
12
                       if (nx != ny)
4121
3
                           croak("cor: x and y must have the same length (%lu vs %lu)",
4122                                 nx, ny);
4123
4124
9
                       if (nx < 2)
4125
0
                           croak("cor: need at least 2 observations");
4126
4127                       double *restrict xd, *restrict yd;
4128
9
                       Newx(xd, nx, double);
4129
9
                       Newx(yd, ny, double);
4130
4131
90
                       for (size_t i = 0; i < nx; i++) {
4132
81
                           SV**restrict tv = av_fetch(x_av, i, 0);
4133
81
                           xd[i] = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4134                       }
4135
90
                       for (size_t i = 0; i < ny; i++) {
4136
81
                           SV**restrict tv = av_fetch(y_av, i, 0);
4137
81
                           yd[i] = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4138                       }
4139
4140
9
                       double r = compute_cor(xd, yd, nx, method);
4141
9
                       Safefree(xd); Safefree(yd);
4142
9
                       RETVAL = newSVnv(r);
4143                   }
4144          } else {//Branch 2: x is a matrix (or y is a matrix)  â†’  AoA result
4145                   // -- resolve x matrix dimensions
4146
3
                   if (!x_is_matrix)
4147
0
                       croak("cor: x must be a matrix (array ref of array refs) "
4148                             "when y is a matrix");
4149
4150
3
                   SV**restrict xr0 = av_fetch(x_av, 0, 0);
4151
3
                   if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV)
4152
0
                       croak("cor: each row of x must be an ARRAY reference");
4153
4154
3
                   size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1;
4155
3
                   if (ncols_x == 0) croak("cor: x matrix has zero columns");
4156
4157
3
                   size_t nrows   = nx;   /* observations */
4158
4159                   // PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak
4160
12
                   for (size_t i = 0; i < nrows; i++) {
4161
9
                       SV**restrict rv = av_fetch(x_av, i, 0);
4162
9
                       if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4163
0
                           croak("cor: x row %lu is not an array ref", i);
4164                   }
4165
4166
3
           if (has_y && y_is_matrix) {
4167
3
                       if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny);
4168
12
               for (size_t i = 0; i < nrows; i++) {
4169
9
                           SV**restrict rv = av_fetch(y_av, i, 0);
4170
9
                           if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4171
0
                               croak("cor: y row %lu is not an array ref", i);
4172                       }
4173           }
4174
4175                   // -- extract x columns
4176                   double **restrict col_x;
4177
3
                   Newx(col_x, ncols_x, double*);
4178
4179
9
                   for (size_t j = 0; j < ncols_x; j++) {
4180
6
                       Newx(col_x[j], nrows, double);
4181
24
                       for (size_t i = 0; i < nrows; i++) {
4182
18
                           SV**restrict rv = av_fetch(x_av, i, 0);
4183
18
                           AV*restrict  row = (AV*)SvRV(*rv);
4184
18
                           SV**restrict cv  = av_fetch(row, j, 0);
4185
18
                           col_x[j][i] = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4186                       }
4187                   }
4188
4189                   // -- resolve y: separate matrix or re-use x (symmetric)
4190                   size_t ncols_y;
4191
3
                   double **restrict col_y   = NULL;
4192
3
                   bool symmetric = 0;
4193
4194                   // 1 = cor(X) — result is symmetric
4195
6
                   if (has_y && y_is_matrix) {
4196                       // cross-correlation: X (nrows × p) vs Y (nrows × q)
4197
3
                       SV**restrict yr0 = av_fetch(y_av, 0, 0);
4198
3
                       ncols_y = av_len((AV*)SvRV(*yr0)) + 1;
4199
3
                       if (ncols_y == 0) croak("cor: y matrix has zero columns");
4200
4201
3
                       Newx(col_y, ncols_y, double*);
4202
9
                       for (size_t j = 0; j < ncols_y; j++) {
4203
6
                           Newx(col_y[j], nrows, double);
4204
24
                           for (size_t i = 0; i < nrows; i++) {
4205
18
                               SV**restrict  rv = av_fetch(y_av, i, 0);
4206
18
                               AV*restrict  row = (AV*)SvRV(*rv);
4207
18
                               SV**restrict cv  = av_fetch(row, j, 0);
4208
18
                               col_y[j][i] = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4209                           }
4210                       }
4211                   } else { // cor(X) — symmetric p×p result; share column arrays
4212
0
                       ncols_y  = ncols_x;
4213
0
                       col_y    = col_x;
4214
0
                       symmetric = 1;
4215                   }
4216
3
                   if (nrows < 2)
4217
0
                       croak("cor: need at least 2 observations (got %lu)", nrows);
4218                   // -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle
4219
3
                   AV*restrict result_av = newAV();
4220
3
                   av_extend(result_av, ncols_x - 1);
4221                   // Allocate per-row AVs up front so we can fill them in order
4222                   AV **restrict rows_out;
4223
3
                   Newx(rows_out, ncols_x, AV*);
4224
9
                   for (size_t i = 0; i < ncols_x; i++) {
4225
6
                       rows_out[i] = newAV();
4226
6
                       av_extend(rows_out[i], ncols_y - 1);
4227                   }
4228
3
                   if (symmetric) {
4229                       /* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */
4230                       double **restrict r_cache;
4231
0
                       Newx(r_cache, ncols_x, double*);
4232
0
                       for (size_t i = 0; i < ncols_x; i++)
4233
0
                           Newx(r_cache[i], ncols_x, double);
4234
4235
0
                       for (size_t i = 0; i < ncols_x; i++) {
4236
0
                           r_cache[i][i] = 1.0; // diagonal
4237
0
                           for (size_t j = i + 1; j < ncols_x; j++) {
4238
0
                               double r = compute_cor(col_x[i], col_x[j], nrows, method);
4239
0
                               r_cache[i][j] = r;
4240
0
                               r_cache[j][i] = r; // symmetry
4241                           }
4242                       }
4243                       // fill output AoA from cache
4244
0
                       for (size_t i = 0; i < ncols_x; i++)
4245
0
                           for (size_t j = 0; j < ncols_x; j++)
4246
0
                               av_store(rows_out[i], j, newSVnv(r_cache[i][j]));
4247
4248
0
                       for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]);
4249
0
                       Safefree(r_cache); r_cache = NULL;
4250                   } else {
4251                       // cross-correlation: every (i,j) pair is independent
4252
9
                       for (size_t i = 0; i < ncols_x; i++)
4253
18
                           for (size_t j = 0; j < ncols_y; j++)
4254
12
                               av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method)));
4255                   }
4256                   // push row AVs into result
4257
9
                   for (size_t i = 0; i < ncols_x; i++)
4258
6
                       av_store(result_av, i, newRV_noinc((SV*)rows_out[i]));
4259
3
                   Safefree(rows_out); rows_out = NULL;
4260                   // -- free column arrays -------------------------------------
4261
9
                   for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]);
4262
3
                   Safefree(col_x); col_x = NULL;
4263
3
                   if (!symmetric) {
4264
9
                       for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]);
4265
3
                       Safefree(col_y);
4266                   }
4267
3
                   RETVAL = newRV_noinc((SV*)result_av);
4268          }
4269        OUTPUT:
4270                RETVAL
4271
4272void scale(...)
4273        PROTOTYPE: @
4274        PPCODE:
4275        {
4276
15
                bool do_center_mean = true, do_scale_sd = true;
4277
15
                double center_val = 0.0, scale_val = 1.0;
4278
15
                size_t data_items = items;
4279                // 1. Parse Options Hash (if it exists as the last argument)
4280
15
                if (items > 0) {
4281
15
                        SV*restrict last_arg = ST(items - 1);
4282
15
                        if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) {
4283
6
                                 data_items = items - 1; // Exclude hash from data processing
4284
6
                                 HV*restrict opt_hv = (HV*)SvRV(last_arg);
4285                                 // --- Parse 'center'
4286
6
                                 SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0);
4287
6
                                 if (center_sv) {
4288
6
                                     SV*restrict val_sv = *center_sv;
4289
6
                                     if (!SvOK(val_sv)) {
4290
0
                                         do_center_mean = false; center_val = 0.0;
4291                                     } else {
4292
6
                                         char *restrict str = SvPV_nolen(val_sv);
4293                                         /* Trap booleans and empty strings before numeric checks */
4294
6
                                         if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4295
3
                                             do_center_mean = true;
4296
3
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4297
3
                                             do_center_mean = false; center_val = 0.0;
4298
0
                                         } else if (looks_like_number(val_sv)) {
4299
0
                                             do_center_mean = false; center_val = SvNV(val_sv);
4300
0
                                         } else if (SvTRUE(val_sv)) {
4301
0
                                             do_center_mean = true;
4302                                         } else {
4303
0
                                             do_center_mean = false; center_val = 0.0;
4304                                         }
4305                                     }
4306                                 }
4307                                 // --- Parse 'scale' ---
4308
6
                                 SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0);
4309
6
                                 if (scale_sv) {
4310
3
                                     SV*restrict val_sv = *scale_sv;
4311
3
                                     if (!SvOK(val_sv)) {
4312
0
                                         do_scale_sd = false; scale_val = 1.0;
4313                                     } else {
4314
3
                                         char *restrict str = SvPV_nolen(val_sv);
4315
3
                                         if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4316
0
                                             do_scale_sd = true;
4317
3
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4318
3
                                             do_scale_sd = false; scale_val = 1.0;
4319
0
                                         } else if (looks_like_number(val_sv)) {
4320
0
                                             do_scale_sd = false; scale_val = SvNV(val_sv);
4321
0
                                             if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */
4322
0
                                         } else if (SvTRUE(val_sv)) {
4323
0
                                             do_scale_sd = true;
4324                                         } else {
4325
0
                                             do_scale_sd = false; scale_val = 1.0;
4326                                         }
4327                                     }
4328                                 }
4329                        }
4330                }
4331                // 2. Detect if the input is a Matrix (Array of Arrays)
4332
15
                bool is_matrix = false;
4333
15
                if (data_items == 1) {
4334
6
                        SV*restrict first_arg = ST(0);
4335
6
                        if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) {
4336
3
                                 AV*restrict av = (AV*)SvRV(first_arg);
4337
3
                                 if (av_len(av) >= 0) {
4338
3
                                     SV**restrict first_elem = av_fetch(av, 0, 0);
4339
3
                                     if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
4340
3
                                         is_matrix = true;
4341                                     }
4342                                 }
4343                        }
4344                }
4345
15
                if (is_matrix) {
4346                        //=========================================================
4347                        // MATRIX MODE: Scale columns independently (Just like R)
4348                        //=========================================================
4349
3
                        AV*restrict mat_av = (AV*)SvRV(ST(0));
4350
3
                        size_t nrow = av_len(mat_av) + 1, ncol = 0;
4351
4352
3
                        SV**restrict first_row = av_fetch(mat_av, 0, 0);
4353
3
                        ncol = av_len((AV*)SvRV(*first_row)) + 1;
4354
4355
3
                        if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix");
4356
4357                        // Create a new matrix for the scaled output
4358
3
                        AV*restrict result_av = newAV();
4359
3
                        av_extend(result_av, nrow - 1);
4360
3
                        AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*));
4361
12
                        for (size_t r = 0; r < nrow; r++) {
4362
9
                                 row_ptrs[r] = newAV();
4363
9
                                 av_extend(row_ptrs[r], ncol - 1);
4364
9
                                 av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4365                        }
4366                        // Calculate and apply scale per column
4367
9
                        for (size_t c = 0; c < ncol; c++) {
4368
6
                                 double col_sum = 0.0;
4369                                 double *restrict col_data;
4370
6
                                 Newx(col_data, nrow, double);
4371                                 // Extract the column data
4372
24
                                 for (size_t r = 0; r < nrow; r++) {
4373
18
                                     SV**restrict row_sv = av_fetch(mat_av, r, 0);
4374
18
                                     if (row_sv && SvROK(*row_sv)) {
4375
18
                                         AV*restrict row_av = (AV*)SvRV(*row_sv);
4376
18
                                         SV**restrict cell_sv = av_fetch(row_av, c, 0);
4377
18
                                         col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0;
4378                                     } else {
4379
0
                                         col_data[r] = 0.0;
4380                                     }
4381
18
                                     col_sum += col_data[r];
4382                                 }
4383
4384
6
                                 double col_center = do_center_mean ? (col_sum / nrow) : center_val;
4385
6
                                 double col_scale = scale_val;
4386                                 // Calculate Standard Deviation for this specific column if needed
4387
6
                                 if (do_scale_sd) {
4388
6
                                     if (nrow <= 1) {
4389
0
                                         Safefree(col_data);
4390
0
                                         safefree(row_ptrs);
4391
0
                                         croak("scale needs >= 2 rows to calculate standard deviation for a matrix column");
4392                                     }
4393
6
                                     double sum_sq = 0.0;
4394
24
                                     for (size_t r = 0; r < nrow; r++) {
4395
18
                                         double diff = col_data[r] - col_center;
4396
18
                                         sum_sq += diff * diff;
4397                                     }
4398
6
                                     col_scale = sqrt(sum_sq / (nrow - 1));
4399                                 }
4400                                 // Store scaled values back into the new matrix rows
4401
24
                                 for (size_t r = 0; r < nrow; r++) {
4402
18
                                     double centered = col_data[r] - col_center;
4403
18
                                     double final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale);
4404
18
                                     av_store(row_ptrs[r], c, newSVnv(final_val));
4405                                 }
4406
6
                                 Safefree(col_data);
4407                        }
4408
3
                        safefree(row_ptrs);
4409                        // Push the resulting matrix as a single Reference onto the Perl stack
4410
3
                        EXTEND(SP, 1);
4411
3
                        PUSHs(sv_2mortal(newRV_noinc((SV*)result_av)));
4412                } else {
4413                        // ======================================
4414                        // FLAT LIST MODE: Original functionality
4415                        // ======================================
4416
12
                        size_t total_count = 0, k = 0;
4417                        double *restrict nums;
4418
12
                        double sum = 0.0;
4419
60
                        for (size_t i = 0; i < data_items; i++) {
4420
48
                                SV*restrict arg = ST(i);
4421
48
                                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4422
0
                                  AV*restrict av = (AV*)SvRV(arg);
4423
0
                                  size_t len = av_len(av) + 1;
4424
0
                                  for (unsigned int j = 0; j < len; j++) {
4425
0
                                                SV**restrict tv = av_fetch(av, j, 0);
4426
0
                                                if (tv && SvOK(*tv)) { total_count++; }
4427                                  }
4428
48
                                } else if (SvOK(arg)) {
4429
48
                                  total_count++;
4430                                }
4431                        }
4432
12
                        if (total_count == 0) croak("scale requires at least 1 numeric element");
4433
12
                        Newx(nums, total_count, double);
4434
60
                        for (size_t i = 0; i < data_items; i++) {
4435
48
                                 SV*restrict arg = ST(i);
4436
48
                                 if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4437
0
                                     AV*restrict av = (AV*)SvRV(arg);
4438
0
                                     size_t len = av_len(av) + 1;
4439
0
                                     for (size_t j = 0; j < len; j++) {
4440
0
                                         SV**restrict tv = av_fetch(av, j, 0);
4441
0
                                         if (tv && SvOK(*tv)) {
4442
0
                                             double val = SvNV(*tv);
4443
0
                                             nums[k++] = val; sum += val;
4444                                         }
4445                                     }
4446
48
                                 } else if (SvOK(arg)) {
4447
48
                                     double val = SvNV(arg);
4448
48
                                     nums[k++] = val; sum += val;
4449                                 }
4450                        }
4451
4452
12
                        if (do_center_mean) center_val = sum / total_count;
4453
4454
12
                        if (do_scale_sd) {
4455
9
                                 if (total_count <= 1) {
4456
3
                                     Safefree(nums);
4457
3
                                     croak("scale needs >= 2 elements to calculate SD");
4458                                 }
4459
6
                                 double sum_sq = 0.0;
4460
36
                                 for (size_t i = 0; i < total_count; i++) {
4461
30
                                     double diff = nums[i] - center_val;
4462
30
                                     sum_sq += diff * diff;
4463                                 }
4464
6
                                 scale_val = sqrt(sum_sq / (total_count - 1));
4465                        }
4466
9
                        EXTEND(SP, total_count);
4467
54
                        for (size_t i = 0; i < total_count; i++) {
4468
45
                                double centered = nums[i] - center_val;
4469
45
                                double final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val);
4470
45
                                PUSHs(sv_2mortal(newSVnv(final_val)));
4471                        }
4472
9
                        Safefree(nums); nums = NULL;
4473                }
4474        }
4475
4476SV* matrix(...)
4477CODE:
4478        // Basic check: must have an even number of arguments for key => value
4479
12
        if (items % 2 != 0) {
4480
0
          croak("Usage: matrix(data => [...], nrow => $n, ncol => $m, byrow => $bool)");
4481        }
4482
12
        SV*restrict data_sv = NULL;
4483
12
        size_t nrow = 0, ncol = 0;
4484
12
        bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE;
4485        // Parse named arguments
4486
36
        for (size_t i = 0; i < items; i += 2) {
4487
24
          char*restrict key = SvPV_nolen(ST(i));
4488
24
          SV*restrict val   = ST(i + 1);
4489
24
          if (strEQ(key, "data")) {
4490
12
                   data_sv = val;
4491
12
          } else if (strEQ(key, "nrow")) {
4492
9
                   nrow = (size_t)SvUV(val);
4493
9
                   nrow_set = TRUE;
4494
3
          } else if (strEQ(key, "ncol")) {
4495
3
                   ncol = (size_t)SvUV(val);
4496
3
                   ncol_set = TRUE;
4497
0
          } else if (strEQ(key, "byrow")) {
4498
0
                   byrow = SvTRUE(val);
4499          } else {
4500
0
                   croak("Unknown option: %s", key);
4501          }
4502        }
4503        // Validate data input
4504
12
        if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) {
4505
3
          croak("The 'data' option must be an array reference (e.g. data => [1..6])");
4506        }
4507
9
        AV*restrict data_av = (AV*)SvRV(data_sv);
4508
9
        size_t data_len = (UV)(av_top_index(data_av) + 1);
4509
9
        if (data_len == 0) {
4510
3
          croak("Data array cannot be empty");
4511        }
4512        // R-style dimension inference
4513
6
        if (!nrow_set && !ncol_set) {
4514
0
          nrow = data_len;
4515
0
          ncol = 1;
4516
6
        } else if (nrow_set && !ncol_set) {
4517
3
          ncol = (data_len + nrow - 1) / nrow;
4518
3
        } else if (!nrow_set && ncol_set) {
4519
0
          nrow = (data_len + ncol - 1) / ncol;
4520        }
4521        // Final safety check for dimensions
4522
6
        if (nrow == 0 || ncol == 0) {
4523
3
          croak("Dimensions must be greater than 0");
4524        }
4525        // Create the matrix (Array of Arrays)
4526
3
        AV*restrict result_av = newAV();
4527
3
        av_extend(result_av, nrow - 1);
4528        size_t r, c;// Use unsigned types for counters to prevent negative indexing
4529
3
        AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */
4530
9
        for (r = 0; r < nrow; r++) {
4531
6
          row_ptrs[r] = newAV();
4532
6
          av_extend(row_ptrs[r], ncol - 1);
4533
6
          av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4534        }
4535        // Fill the matrix
4536
3
        size_t total_cells = nrow * ncol;
4537
21
        for (size_t i = 0; i < total_cells; i++) {
4538          // Vector recycling logic
4539
18
          SV**restrict fetched = av_fetch(data_av, i % data_len, 0);
4540
18
          SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0);
4541
18
          if (byrow) {
4542
0
                   r = i / ncol;
4543
0
                   c = i % ncol;
4544          } else {
4545
18
                   r = i % nrow;
4546
18
                   c = i / nrow;
4547          }
4548
18
          av_store(row_ptrs[r], c, val);
4549        }
4550
3
        safefree(row_ptrs);
4551
3
        RETVAL = newRV_noinc((SV*)result_av);
4552        OUTPUT:
4553        RETVAL
4554
4555SV* lm(...)
4556CODE:
4557{
4558
66
        const char *restrict formula  = NULL;
4559
66
        SV *restrict data_sv = NULL;
4560        char f_cpy[512];
4561        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
4562
4563
66
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
4564
66
        bool *restrict is_dummy = NULL;
4565
66
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
4566
66
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
4567
66
        size_t n = 0, valid_n = 0, i, j, k, l, l1, l2;
4568
66
        bool has_intercept = true;
4569
4570
66
        char **restrict row_names = NULL, **restrict valid_row_names = NULL;
4571
66
        HV **restrict row_hashes = NULL;
4572
66
        HV *restrict data_hoa = NULL;
4573
66
        SV *restrict ref = NULL;
4574
4575
66
        double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL;
4576
66
        bool *restrict aliased = NULL;
4577
66
        double *restrict beta = NULL;
4578
66
        int final_rank = 0, df_res = 0;
4579        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
4580        AV *restrict terms_av;
4581
66
        double rss = 0.0, rse_sq = 0.0;
4582        HE *restrict entry;
4583
4584
66
        if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)");
4585
4586
192
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
4587
126
          const char *restrict key = SvPV_nolen(ST(i_arg));
4588
126
          SV *restrict val = ST(i_arg + 1);
4589
126
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
4590
63
          else if (strEQ(key, "data"))    data_sv = val;
4591
0
          else croak("lm: unknown argument '%s'", key);
4592        }
4593
66
        if (!formula) croak("lm: formula is required");
4594
63
        if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference");
4595
4596        // ========================================================================
4597        // PHASE 1: Data Extraction
4598        // ========================================================================
4599
57
        ref = SvRV(data_sv);
4600
57
        if (SvTYPE(ref) == SVt_PVHV) {
4601
57
          HV *restrict hv = (HV*)ref;
4602
57
          if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty");
4603
57
          entry = hv_iternext(hv);
4604
57
          if (entry) {
4605
57
                   SV *restrict val = hv_iterval(hv, entry);
4606
57
                   if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
4607
36
                       data_hoa = hv;
4608
36
                       n = av_len((AV*)SvRV(val)) + 1;
4609
36
                       Newx(row_names, n, char*);
4610
246
                       for (i = 0; i < n; i++) {
4611                           char buf[32];
4612
210
                           snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4613
210
                           row_names[i] = savepv(buf);
4614                       }
4615
21
                   } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
4616
21
                       n = hv_iterinit(hv);
4617
21
                       Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
4618
21
                       i = 0;
4619
693
                       while ((entry = hv_iternext(hv))) {
4620                           unsigned int len;
4621
672
                           row_names[i] = savepv(hv_iterkey(entry, &len));
4622
672
                           row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
4623
672
                           i++;
4624                       }
4625
0
                   } else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
4626          }
4627
0
        } else if (SvTYPE(ref) == SVt_PVAV) {
4628
0
          AV *restrict av = (AV*)ref; n = av_len(av) + 1;
4629
0
          Newx(row_names, n, char*);
4630
0
          Newx(row_hashes, n, HV*);
4631
0
          for (i = 0; i < n; i++) {
4632
0
                   SV **restrict val = av_fetch(av, i, 0);
4633
0
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
4634
0
                       row_hashes[i] = (HV*)SvRV(*val);
4635
0
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4636
0
                       row_names[i] = savepv(buf);
4637                   } else {
4638
0
                       for (k = 0; k < i; k++) Safefree(row_names[k]);
4639
0
                       Safefree(row_names); Safefree(row_hashes);
4640
0
                       croak("lm: Array values must be HashRefs (AoH)");
4641                   }
4642          }
4643
0
        } else croak("lm: Data must be an Array or Hash reference");
4644
4645        // ========================================================================
4646        // PHASE 2: Formula Parsing & `.` Expansion
4647        // ========================================================================
4648
57
        src = (char*)formula; dst = f_cpy;
4649
645
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
4650
57
        *dst = '\0';
4651
4652
57
        tilde = strchr(f_cpy, '~');
4653
57
        if (!tilde) {
4654
9
          for (i = 0; i < n; i++) Safefree(row_names[i]);
4655
3
          Safefree(row_names); if (row_hashes) Safefree(row_hashes);
4656
3
          croak("lm: invalid formula, missing '~'");
4657        }
4658
54
        *tilde = '\0';
4659
54
        lhs = f_cpy;
4660
54
        rhs = tilde + 1;
4661
4662        // Remove intercept-suppression markers from RHS.
4663        // IMPORTANT: skip tokens that appear inside I(...) wrappers so that
4664        // expressions like I(x^-1) are never mistakenly treated as "-1".
4665        {
4666
54
          char *restrict p_idx = rhs;
4667
267
          while (*p_idx) {
4668                   // Skip over I(...) sub-expressions entirely
4669
213
                   if (p_idx[0] == 'I' && p_idx[1] == '(') {
4670
0
                       int depth = 0;
4671
0
                       while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; }
4672
0
                       continue;
4673                   }
4674                   // Match bare -1
4675
213
                   if (p_idx[0] == '-' && p_idx[1] == '1' &&
4676
3
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4677
3
                       has_intercept = false;
4678
3
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4679
3
                       continue; // re-examine same position
4680                   }
4681                   // Match +0
4682
210
                   if (p_idx[0] == '+' && p_idx[1] == '0' &&
4683
0
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4684
0
                       has_intercept = false;
4685
0
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4686
0
                       continue;
4687                   }
4688                   // Match leading 0+
4689
210
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') {
4690
0
                       has_intercept = false;
4691
0
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4692
0
                       continue;
4693                   }
4694                   // Match bare 0 (entire rhs)
4695
210
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') {
4696
0
                       has_intercept = false; p_idx[0] = '\0'; break;
4697                   }
4698                   // Strip redundant +1 (keep intercept, just remove marker)
4699
210
                   if (p_idx[0] == '+' && p_idx[1] == '1' &&
4700
0
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4701
0
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4702
0
                       continue;
4703                   }
4704                   // Strip leading bare 1 or 1+
4705
210
                   if (p_idx == rhs) {
4706
54
                       if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; }
4707
54
                       if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; }
4708                   }
4709
210
                   p_idx++;
4710          }
4711        }
4712
4713        // Clean up stray `++`, leading `+`, trailing `+`
4714        {
4715          char *restrict p_idx;
4716
54
          while ((p_idx = strstr(rhs, "++")) != NULL)
4717
0
                   memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
4718
54
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
4719
54
          size_t len_rhs = strlen(rhs);
4720
54
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
4721        }
4722
4723        // Expand `.` Operator
4724
54
        char rhs_expanded[2048] = "";
4725
54
        size_t rhs_len = 0;
4726
54
        chunk = strtok(rhs, "+");
4727
132
        while (chunk != NULL) {
4728
78
          if (strcmp(chunk, ".") == 0) {
4729
3
                   AV *cols = get_all_columns(data_hoa, row_hashes, n);
4730
12
                   for (size_t c = 0; c <= (size_t)av_len(cols); c++) {
4731
9
                       SV **col_sv = av_fetch(cols, c, 0);
4732
9
                       if (col_sv && SvOK(*col_sv)) {
4733
9
                           const char *col_name = SvPV_nolen(*col_sv);
4734
9
                           if (strcmp(col_name, lhs) != 0) {
4735
6
                               size_t slen = strlen(col_name);
4736
6
                               if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4737
6
                                   if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4738
6
                                   strcat(rhs_expanded, col_name);
4739
6
                                   rhs_len += slen;
4740                               }
4741                           }
4742                       }
4743                   }
4744
3
                   SvREFCNT_dec(cols);
4745          } else {
4746
75
                   size_t slen = strlen(chunk);
4747
75
                   if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4748
75
                       if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4749
75
                       strcat(rhs_expanded, chunk);
4750
75
                       rhs_len += slen;
4751                   }
4752          }
4753
78
          chunk = strtok(NULL, "+");
4754        }
4755
4756
54
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
4757
54
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
4758
54
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
4759
4760
54
        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
4761
4762
54
        if (strlen(rhs_expanded) > 0) {
4763
54
          chunk = strtok(rhs_expanded, "+");
4764
135
          while (chunk != NULL) {
4765
81
                   if (num_terms >= term_cap - 3) {
4766
0
                       term_cap *= 2;
4767
0
                       Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
4768                   }
4769
81
                   char *restrict star = strchr(chunk, '*');
4770
81
                   if (star) {
4771
3
                       *star = '\0';
4772
3
                       char *restrict left = chunk;
4773
3
                       char *restrict right = star + 1;
4774
3
                       char *restrict c_l = strchr(left, '^');
4775
3
                       if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
4776
3
                       char *restrict c_r = strchr(right, '^');
4777
3
                       if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
4778
3
                       terms[num_terms++] = savepv(left);
4779
3
                       terms[num_terms++] = savepv(right);
4780
3
                       size_t inter_len = strlen(left) + strlen(right) + 2;
4781
3
                       terms[num_terms] = (char*)safemalloc(inter_len);
4782
3
                       snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
4783                   } else {
4784
78
                       char *restrict c_chunk = strchr(chunk, '^');
4785
78
                       if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
4786
78
                       terms[num_terms++] = savepv(chunk);
4787                   }
4788
81
                   chunk = strtok(NULL, "+");
4789          }
4790        }
4791
4792
192
        for (i = 0; i < num_terms; i++) {
4793
138
          bool found = false;
4794
258
          for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = true; break; } }
4795
138
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
4796        }
4797
54
        p = num_uniq;
4798
4799        // ========================================================================
4800        // PHASE 3: Categorical Expansion
4801        // ========================================================================
4802
192
        for (j = 0; j < p; j++) {
4803
138
          if (p_exp + 32 >= exp_cap) {
4804
0
                   exp_cap *= 2;
4805
0
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
4806
0
                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
4807          }
4808
138
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
4809
51
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = false; p_exp++; continue;
4810          }
4811
87
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
4812
15
                   char **restrict levels = NULL;
4813
15
                   unsigned int num_levels = 0, levels_cap = 8;
4814
15
                   Newx(levels, levels_cap, char*);
4815
141
                   for (i = 0; i < n; i++) {
4816
126
                       char *str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
4817
126
                       if (str_val) {
4818
126
                           bool found = false;
4819
243
                           for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = true; break; } }
4820
126
                           if (!found) {
4821
42
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
4822
42
                               levels[num_levels++] = savepv(str_val);
4823                           }
4824
126
                           Safefree(str_val);
4825                       }
4826                   }
4827
15
                   if (num_levels > 0) {
4828
42
                       for (l1 = 0; l1 < num_levels - 1; l1++)
4829
66
                           for (l2 = l1 + 1; l2 < num_levels; l2++)
4830
39
                               if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; }
4831
42
                       for (l = 1; l < num_levels; l++) {
4832
27
                           if (p_exp >= exp_cap) {
4833
0
                               exp_cap *= 2;
4834
0
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
4835
0
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
4836                           }
4837
27
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
4838
27
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
4839
27
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
4840
27
                           is_dummy[p_exp] = true;
4841
27
                           dummy_base[p_exp]  = savepv(uniq_terms[j]);
4842
27
                           dummy_level[p_exp] = savepv(levels[l]);
4843
27
                           p_exp++;
4844                       }
4845
57
                       for (l = 0; l < num_levels; l++) Safefree(levels[l]);
4846
15
                       Safefree(levels);
4847                   } else {
4848
0
                       Safefree(levels);
4849
0
                       exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
4850                   }
4851          } else {
4852
72
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
4853          }
4854        }
4855
54
        p = p_exp;
4856
54
        Newx(X, n * p, double); Newx(Y, n, double);
4857
54
        Newx(valid_row_names, n, char*);
4858
4859        // ========================================================================
4860        // PHASE 4: Matrix Construction & Listwise Deletion
4861        // ========================================================================
4862
930
        for (i = 0; i < n; i++) {
4863
876
          double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
4864
876
          if (isnan(y_val)) { Safefree(row_names[i]); continue; }
4865
4866
867
          bool row_ok = true;
4867
867
          double *restrict row_x = (double*)safemalloc(p * sizeof(double));
4868
3336
          for (j = 0; j < p; j++) {
4869
2469
                   if (strcmp(exp_terms[j], "Intercept") == 0) {
4870
771
                       row_x[j] = 1.0;
4871
1698
                   } else if (is_dummy[j]) {
4872
234
                       char *restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
4873
234
                       if (str_val) {
4874
234
                           row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
4875
234
                           Safefree(str_val);
4876
0
                       } else { row_ok = false; break; }
4877                   } else {
4878
1464
                       row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
4879
1464
                       if (isnan(row_x[j])) { row_ok = false; break; }
4880                   }
4881          }
4882
867
          if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
4883
4884
867
          Y[valid_n] = y_val;
4885
3336
          for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
4886
867
          valid_row_names[valid_n] = row_names[i];
4887
867
          valid_n++;
4888
867
          Safefree(row_x);
4889        }
4890
54
        Safefree(row_names);
4891
4892
54
        if (valid_n <= p) {
4893
21
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
4894
21
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
4895
21
          for (j = 0; j < p_exp; j++) {
4896
15
                   Safefree(exp_terms[j]);
4897
15
                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
4898          }
4899
6
          Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
4900
6
          Safefree(X); Safefree(Y); Safefree(valid_row_names);
4901
6
          if (row_hashes) Safefree(row_hashes);
4902
6
          croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)");
4903        }
4904
4905        // ========================================================================
4906        // PHASE 5: OLS Math
4907        // ========================================================================
4908
48
        Newxz(XtX, p * p, double);
4909
183
        for (i = 0; i < p; i++)
4910
534
          for (j = 0; j < p; j++) {
4911
399
                   double sum = 0.0;
4912
7860
                   for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j];
4913
399
                   XtX[i * p + j] = sum;
4914          }
4915
48
        Newxz(XtY, p, double);
4916
183
        for (i = 0; i < p; i++) {
4917
135
          double sum = 0.0;
4918
2580
          for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k];
4919
135
          XtY[i] = sum;
4920        }
4921
48
        Newx(aliased, p, bool);
4922
48
        final_rank = sweep_matrix_ols(XtX, p, aliased);
4923
48
        Newxz(beta, p, double);
4924
183
        for (i = 0; i < p; i++) {
4925
135
          if (aliased[i]) { beta[i] = NAN; }
4926          else {
4927
132
                   double sum = 0.0;
4928
522
                   for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j];
4929
132
                   beta[i] = sum;
4930          }
4931        }
4932
4933        // ========================================================================
4934        // PHASE 6: Metrics & Cleanup
4935        // ========================================================================
4936
48
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
4937
48
        summary_hv = newHV(); terms_av = newAV();
4938
4939
48
        df_res = (int)valid_n - final_rank;
4940
4941        // rss / mss accumulated here — rse_sq computed AFTER this loop (not before)
4942
48
        double sum_y = 0.0, mss = 0.0;
4943
906
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
4944
48
        double mean_y = sum_y / (double)valid_n;
4945
4946
906
        for (i = 0; i < valid_n; i++) {
4947
858
          double y_hat = 0.0;
4948
3303
          for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j];
4949
858
          double res   = Y[i] - y_hat;
4950
858
          rss          += res * res;
4951
858
          double diff_m = has_intercept ? (y_hat - mean_y) : y_hat;
4952
858
          mss          += diff_m * diff_m;
4953
858
          hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0);
4954
858
          hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res),   0);
4955
858
          Safefree(valid_row_names[i]);
4956        }
4957
48
        Safefree(valid_row_names);
4958
4959        // Single, authoritative rse_sq calculation
4960
48
        rse_sq = (df_res > 0) ? (rss / (double)df_res) : NAN;
4961
4962
48
        int df_int = has_intercept ? 1 : 0;
4963
48
        double r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN;
4964
48
        int numdf = final_rank - df_int;
4965
4966
48
        if (final_rank != df_int && (mss + rss) > 0.0) {
4967
48
          r_squared     = mss / (mss + rss);
4968
48
          adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (double)df_res);
4969
48
          if (rse_sq > 0.0 && numdf > 0) {
4970
48
                   f_stat   = (mss / (double)numdf) / rse_sq;
4971
48
                   f_pvalue = 1.0 - pf(f_stat, (double)numdf, (double)df_res);
4972
0
          } else if (rse_sq == 0.0) {
4973
0
                   f_stat   = INFINITY;
4974
0
                   f_pvalue = 0.0;
4975          }
4976
0
        } else if (final_rank == df_int) {
4977
0
          r_squared = 0.0; adj_r_squared = 0.0;
4978        }
4979
4980
183
        for (j = 0; j < p; j++) {
4981
135
          hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
4982
135
          av_push(terms_av, newSVpv(exp_terms[j], 0));
4983
135
          HV *restrict row_hv = newHV();
4984
135
          if (aliased[j]) {
4985
3
                   hv_store(row_hv, "Estimate",   8,  newSVpv("NaN", 0), 0);
4986
3
                   hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
4987
3
                   hv_store(row_hv, "t value",    7,  newSVpv("NaN", 0), 0);
4988
3
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVpv("NaN", 0), 0);
4989          } else {
4990
132
                   double se    = sqrt(rse_sq * XtX[j * p + j]);
4991
132
                   double t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0));
4992
132
                   double p_val = get_t_pvalue(t_val, df_res, "two.sided");
4993
132
                   hv_store(row_hv, "Estimate",   8,  newSVnv(beta[j]), 0);
4994
132
                   hv_store(row_hv, "Std. Error", 10, newSVnv(se),      0);
4995
132
                   hv_store(row_hv, "t value",    7,  newSVnv(t_val),   0);
4996
132
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVnv(p_val),   0);
4997          }
4998
135
          hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
4999        }
5000
5001
48
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv),   0);
5002
48
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
5003
48
        hv_store(res_hv, "residuals",      9, newRV_noinc((SV*)resid_hv),  0);
5004
48
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res),             0);
5005
48
        hv_store(res_hv, "rank",           4, newSVuv(final_rank),         0);
5006
48
        hv_store(res_hv, "rss",            3, newSVnv(rss),                0);
5007
48
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv),0);
5008
48
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av),  0);
5009
48
        hv_store(res_hv, "r.squared",      9, newSVnv(r_squared),          0);
5010
48
        hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared),      0);
5011
48
        if (!isnan(f_stat)) {
5012
48
          AV *fstat_av = newAV();
5013
48
          av_push(fstat_av, newSVnv(f_stat));
5014
48
          av_push(fstat_av, newSViv(numdf));
5015
48
          av_push(fstat_av, newSViv(df_res));
5016
48
          hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0);
5017
48
          hv_store(res_hv, "f.pvalue",    8, newSVnv(f_pvalue),          0);
5018        }
5019
5020        // Deep Cleanup
5021
171
        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5022
171
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5023
183
        for (j = 0; j < p_exp; j++) {
5024
135
          Safefree(exp_terms[j]);
5025
135
          if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5026        }
5027
48
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5028
48
        Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY);
5029
48
        Safefree(beta); Safefree(aliased);
5030
48
        if (row_hashes) Safefree(row_hashes);
5031
5032
48
        RETVAL = newRV_noinc((SV*)res_hv);
5033}
5034OUTPUT:
5035    RETVAL
5036
5037void seq(from, to, by = 1.0)
5038        double from
5039        double to
5040        double by
5041PPCODE:
5042        {
5043                //Handle the zero 'by' case
5044
18
                if (by == 0.0) {
5045
0
                        if (from == to) {
5046
0
                                 EXTEND(SP, 1);
5047
0
                                 mPUSHn(from);
5048
0
                                 XSRETURN(1);
5049                        } else {
5050
0
                                 croak("invalid 'by' argument: cannot be zero when from != to");
5051                        }
5052                }
5053                // Check for wrong direction / infinite loop
5054
18
                if ((from < to && by < 0.0) || (from > to && by > 0.0)) {
5055
0
                        croak("wrong sign in 'by' argument");
5056                }
5057                /* * Calculate number of elements.
5058                * R uses a small epsilon (like 1e-10) to avoid dropping the last
5059                * element due to floating point inaccuracies.
5060                */
5061
18
                double n_elements_d = (to - from) / by;
5062
18
                if (n_elements_d < 0.0) n_elements_d = 0.0;
5063
18
                size_t n_elements = (n_elements_d + 1e-10) + 1;
5064                // Pre-extend the stack to avoid reallocating inside the loop
5065
18
                EXTEND(SP, n_elements);
5066
9099
                for (size_t i = 0; i < n_elements; i++) {
5067
9081
                        mPUSHn(from + i * by);
5068                }
5069
18
                XSRETURN(n_elements);
5070        }
5071
5072SV* rnorm(...)
5073    CODE:
5074    {
5075        // Auto-seed the PRNG if the Perl script hasn't done so yet
5076
6
        AUTO_SEED_PRNG();
5077
5078
6
        size_t n = 0;
5079
6
        double mean = 0.0, sd = 1.0;
5080
6
        int arg_start = 0;
5081
5082        // Check if the first argument is a simple integer (rnorm(33))
5083
6
        if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) {
5084
0
            n = (unsigned int)SvUV(ST(0));
5085
0
            arg_start = 1; // Start parsing named arguments from the second element
5086        }
5087
5088        // --- Parse remaining named arguments from the flat stack ---
5089
6
        if ((items - arg_start) % 2 != 0) {
5090
0
            croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)");
5091        }
5092
5093
21
        for (int i = arg_start; i < items; i += 2) {
5094
15
            const char* restrict key = SvPV_nolen(ST(i));
5095
15
            SV* restrict val = ST(i + 1);
5096
5097
15
            if      (strEQ(key, "n"))    n    = (unsigned int)SvUV(val);
5098
9
            else if (strEQ(key, "mean")) mean = SvNV(val);
5099
6
            else if (strEQ(key, "sd"))   sd   = SvNV(val);
5100
0
            else croak("rnorm: unknown argument '%s'", key);
5101        }
5102
5103
6
        if (sd < 0.0) croak("rnorm: standard deviation must be non-negative");
5104
5105
3
        AV *restrict result_av = newAV();
5106
3
        if (n > 0) {
5107
3
            av_extend(result_av, n - 1);
5108            // Generate random normals using the Box-Muller transform
5109
15006
            for (size_t i = 0; i < n; ) {
5110                 double u, v, s;
5111                 do {
5112                     // Drand01() hooks into Perl's internal PRNG, respecting Perl's srand()
5113
19147
                     u = 2.0 * Drand01() - 1.0;
5114
19147
                     v = 2.0 * Drand01() - 1.0;
5115
19147
                     s = u * u + v * v;
5116
19147
                 } while (s >= 1.0 || s == 0.0);
5117
5118
15000
                 double mul = sqrt(-2.0 * log(s) / s);
5119                 // Box-Muller generates two independent values per iteration
5120
15000
                 av_store(result_av, i++, newSVnv(mean + sd * u * mul));
5121
15000
                 if (i < n) {
5122
14997
                     av_store(result_av, i++, newSVnv(mean + sd * v * mul));
5123                 }
5124            }
5125        }
5126
3
        RETVAL = newRV_noinc((SV*)result_av);
5127    }
5128    OUTPUT:
5129      RETVAL
5130
5131SV* aov(data_sv, formula_sv)
5132        SV* data_sv
5133        SV* formula_sv
5134        CODE:
5135        {
5136
27
                const char *restrict formula = SvPV_nolen(formula_sv);
5137                char f_cpy[512];
5138                char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5139
5140
27
                char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL;
5141
27
                bool *restrict is_dummy = NULL, *is_interact = NULL;
5142
27
                char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5143
27
                int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL;
5144
27
                unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5145
27
                size_t n = 0, valid_n = 0, i, j;
5146
27
                bool has_intercept = true;
5147
5148
27
                char **restrict row_names = NULL;
5149
27
                HV **restrict row_hashes = NULL;
5150
27
                HV *restrict data_hoa = NULL;
5151
27
                SV *restrict ref = NULL;
5152                HE *restrict entry;
5153
27
                double **restrict X_mat = NULL;
5154
27
                double *restrict Y = NULL;
5155
5156
27
                if (!SvROK(data_sv)) croak("aov: data is required and must be a reference");
5157
5158                // ========================================================================
5159                // PHASE 1: Data Extraction
5160                // ========================================================================
5161
27
                ref = SvRV(data_sv);
5162
27
                if (SvTYPE(ref) == SVt_PVHV) {
5163
27
                        HV*restrict hv = (HV*)ref;
5164
27
                        if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty");
5165
27
                        entry = hv_iternext(hv);
5166
27
                        if (entry) {
5167
27
                                 SV*restrict val = hv_iterval(hv, entry);
5168
27
                                 if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5169
27
                                     data_hoa = hv;
5170
27
                                     n = av_len((AV*)SvRV(val)) + 1;
5171
27
                                     Newx(row_names, n, char*);
5172
201
                                     for(i = 0; i < n; i++) {
5173
174
                                         char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1));
5174
174
                                         row_names[i] = savepv(buf);
5175                                     }
5176
0
                                 } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5177
0
                                     n = hv_iterinit(hv);
5178
0
                                     Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5179
0
                                     i = 0;
5180
0
                                     while ((entry = hv_iternext(hv))) {
5181                                         unsigned int len;
5182
0
                                         row_names[i] = savepv(hv_iterkey(entry, &len));
5183
0
                                         row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5184
0
                                         i++;
5185                                     }
5186
0
                                 } else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5187                        }
5188
0
                } else if (SvTYPE(ref) == SVt_PVAV) {
5189
0
                        AV*restrict av = (AV*)ref;
5190
0
                        n = av_len(av) + 1;
5191
0
                        Newx(row_names, n, char*);
5192
0
                        Newx(row_hashes, n, HV*);
5193
0
                        for (i = 0; i < n; i++) {
5194
0
                                 SV**restrict val = av_fetch(av, i, 0);
5195
0
                                 if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
5196
0
                                     row_hashes[i] = (HV*)SvRV(*val);
5197                                     char buf[32];
5198
0
                                     snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5199
0
                                     row_names[i] = savepv(buf);
5200                                 } else {
5201
0
                                     for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
5202
0
                                     Safefree(row_names); Safefree(row_hashes);
5203
0
                                     croak("aov: Array values must be HashRefs (AoH)");
5204                                 }
5205                        }
5206
0
                } else croak("aov: Data must be an Array or Hash reference");
5207
5208                // ========================================================================
5209                // PHASE 2: Formula Parsing & `.` Expansion
5210                // ========================================================================
5211
27
                src = (char*)formula; dst = f_cpy;
5212
333
                while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
5213
27
                *dst = '\0';
5214
5215
27
                tilde = strchr(f_cpy, '~');
5216
27
                if (!tilde) {
5217
9
                        for (i = 0; i < n; i++) Safefree(row_names[i]);
5218
3
                        Safefree(row_names); if (row_hashes) Safefree(row_hashes);
5219
3
                        croak("aov: invalid formula, missing '~'");
5220                }
5221
24
                *tilde = '\0';
5222
24
                lhs = f_cpy;
5223
24
                rhs = tilde + 1;
5224
5225                char *restrict p_idx;
5226
24
                while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5227
24
                while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5228
24
                while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5229
24
                if (rhs[0] == '0' && rhs[1] == '\0')        { has_intercept = false; rhs[0] = '\0'; }
5230
24
                while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5231
24
                if (rhs[0] == '1' && rhs[1] == '\0')        { rhs[0] = '\0'; }
5232
24
                else if (rhs[0] == '1' && rhs[1] == '+')    { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); }
5233
5234
24
                while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
5235
24
                if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
5236
24
                size_t len_rhs = strlen(rhs);
5237
24
                if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
5238
5239
24
                char rhs_expanded[2048] = "";
5240
24
                size_t rhs_len = 0;
5241
24
                chunk = strtok(rhs, "+");
5242
57
                while (chunk != NULL) {
5243
33
                        if (strcmp(chunk, ".") == 0) {
5244
3
                                 AV *restrict cols = get_all_columns(data_hoa, row_hashes, n);
5245
12
                                 for (size_t c = 0; c <= av_len(cols); c++) {
5246
9
                                     SV **restrict col_sv = av_fetch(cols, c, 0);
5247
9
                                     if (col_sv && SvOK(*col_sv)) {
5248
9
                                         const char *restrict col_name = SvPV_nolen(*col_sv);
5249
9
                                         if (strcmp(col_name, lhs) != 0) {
5250
6
                                             size_t slen = strlen(col_name);
5251
6
                                             if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5252
6
                                                 if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5253
6
                                                 strcat(rhs_expanded, col_name);
5254
6
                                                 rhs_len += slen;
5255                                             }
5256                                         }
5257                                     }
5258                                 }
5259
3
                                 SvREFCNT_dec(cols);
5260                        } else {
5261
30
                                 size_t slen = strlen(chunk);
5262
30
                                 if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5263
30
                                     if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5264
30
                                     strcat(rhs_expanded, chunk);
5265
30
                                     rhs_len += slen;
5266                                 }
5267                        }
5268
33
                        chunk = strtok(NULL, "+");
5269                }
5270
5271                // Setup arrays safely
5272
24
                Newx(terms, term_cap, char*);
5273
24
                Newx(uniq_terms, term_cap, char*);
5274
24
                Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*);
5275
24
                Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool);
5276
24
                Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
5277
24
                Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int);
5278
5279
24
                if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
5280
5281
24
                if (strlen(rhs_expanded) > 0) {
5282
24
                        chunk = strtok(rhs_expanded, "+");
5283
60
                        while (chunk != NULL) {
5284
36
                                 if (num_terms >= term_cap - 3) {
5285
0
                                     term_cap *= 2;
5286
0
                                     Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
5287                                 }
5288
36
                                 char *restrict star = strchr(chunk, '*');
5289
36
                                 if (star) {
5290
3
                                     *star = '\0';
5291
3
                                     char *restrict left = chunk;
5292
3
                                     char *right = star + 1;
5293
3
                                     char *restrict c_l = strchr(left, '^');
5294
3
                                     if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
5295
3
                                     char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
5296
3
                                     terms[num_terms++] = savepv(left);
5297
3
                                     terms[num_terms++] = savepv(right);
5298
3
                                     size_t inter_len = strlen(left) + strlen(right) + 2;
5299
3
                                     terms[num_terms] = (char*)safemalloc(inter_len);
5300
3
                                     snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
5301                                 } else {
5302
33
                                     char *restrict c_chunk = strchr(chunk, '^');
5303
33
                                     if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
5304
33
                                     terms[num_terms++] = savepv(chunk);
5305                                 }
5306
36
                                 chunk = strtok(NULL, "+");
5307                        }
5308                }
5309
5310
90
                for (i = 0; i < num_terms; i++) {
5311
66
                        bool found = false;
5312
129
                        for (size_t k = 0; k < num_uniq; k++) {
5313
63
                                  if (strcmp(terms[i], uniq_terms[k]) == 0) { found = true; break; }
5314                        }
5315
66
                        if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
5316                }
5317
24
                p = num_uniq;
5318
5319                // ========================================================================
5320                // PHASE 3: Categorical & Interaction Expansion
5321                // ========================================================================
5322
87
                for (j = 0; j < p; j++) {
5323
66
                        if (p_exp + 64 >= exp_cap) {
5324
24
                                  exp_cap *= 2;
5325
24
                                  Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5326
24
                                  Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5327
24
                                  Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5328
24
                                  Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5329                        }
5330
5331
66
                        if (strcmp(uniq_terms[j], "Intercept") == 0) {
5332
24
                                  exp_terms[p_exp] = savepv("Intercept");
5333
24
                                  parent_term[p_exp] = savepv("Intercept");
5334
24
                                  is_dummy[p_exp] = false; is_interact[p_exp] = false;
5335
24
                                  term_map[p_exp] = j;
5336
24
                                  p_exp++;
5337
24
                                  continue;
5338                        }
5339
5340
42
                        char *restrict colon = strchr(uniq_terms[j], ':');
5341
42
                        if (colon) {
5342                                  char left[256], right[256];
5343
6
                                  strncpy(left, uniq_terms[j], colon - uniq_terms[j]);
5344
6
                                  left[colon - uniq_terms[j]] = '\0';
5345
6
                                  strcpy(right, colon + 1);
5346
5347
6
                                  int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0;
5348
6
                                  int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0;
5349
18
                                  for (size_t e = 0; e < p_exp; e++) {
5350
12
                                      if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e;
5351
12
                                      if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e;
5352                                  }
5353
5354
6
                                  if (l_count == 0 || r_count == 0) {
5355
3
                                      Safefree(l_indices); Safefree(r_indices);
5356
3
                                      croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]);
5357                                  } else {
5358
6
                                      for (int li = 0; li < l_count; li++) {
5359
6
                                          for (int ri = 0; ri < r_count; ri++) {
5360
3
                                              if (p_exp >= exp_cap) {
5361
0
                                                  exp_cap *= 2;
5362
0
                                                  Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5363
0
                                                  Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5364
0
                                                  Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5365
0
                                                  Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5366                                              }
5367
3
                                              size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2;
5368
3
                                              exp_terms[p_exp] = (char*)safemalloc(t_len);
5369
3
                                              snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]);
5370
3
                                              parent_term[p_exp] = savepv(uniq_terms[j]);
5371
3
                                              is_dummy[p_exp] = false; is_interact[p_exp] = true;
5372
3
                                              left_idx[p_exp] = l_indices[li];
5373
3
                                              right_idx[p_exp] = r_indices[ri];
5374
3
                                              term_map[p_exp] = j;
5375
3
                                              p_exp++;
5376                                          }
5377                                      }
5378                                  }
5379
3
                                  Safefree(l_indices); Safefree(r_indices);
5380                        } else {
5381
36
                                  if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
5382
9
                                      char **restrict levels = NULL;
5383
9
                                      unsigned int num_levels = 0, levels_cap = 8;
5384
9
                                      Newx(levels, levels_cap, char*);
5385
156
                                      for (i = 0; i < n; i++) {
5386
147
                                          char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
5387
147
                                          if (str_val) {
5388
147
                                              bool found = false;
5389
234
                                              for (size_t l = 0; l < num_levels; l++) {
5390
213
                                                  if (strcmp(levels[l], str_val) == 0) { found = true; break; }
5391                                              }
5392
147
                                              if (!found) {
5393
21
                                                  if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
5394
21
                                                  levels[num_levels++] = savepv(str_val);
5395                                              }
5396
147
                                              Safefree(str_val);
5397                                          }
5398                                      }
5399
5400
9
                                      if (num_levels > 0) {
5401
21
                                          for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
5402
27
                                              for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
5403
15
                                                  if (strcmp(levels[l1], levels[l2]) > 0) {
5404
3
                                                      char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
5405                                                  }
5406                                              }
5407                                          }
5408
21
                                          for (size_t l = 1; l < num_levels; l++) {
5409
12
                                              if (p_exp >= exp_cap) {
5410
0
                                                  exp_cap *= 2;
5411
0
                                                  Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5412
0
                                                  Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5413
0
                                                  Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5414
0
                                                  Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5415                                              }
5416
12
                                              size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5417
12
                                              exp_terms[p_exp] = (char*)safemalloc(t_len);
5418
12
                                              snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5419
12
                                              parent_term[p_exp] = savepv(uniq_terms[j]);
5420
12
                                              is_dummy[p_exp] = true; is_interact[p_exp] = false;
5421
12
                                              dummy_base[p_exp] = savepv(uniq_terms[j]);
5422
12
                                              dummy_level[p_exp] = savepv(levels[l]);
5423
12
                                              term_map[p_exp] = j;
5424
12
                                              p_exp++;
5425                                          }
5426
30
                                          for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
5427
9
                                          Safefree(levels);
5428                                      } else {
5429
0
                                          Safefree(levels);
5430
0
                                          exp_terms[p_exp] = savepv(uniq_terms[j]);
5431
0
                                          parent_term[p_exp] = savepv(uniq_terms[j]);
5432
0
                                          is_dummy[p_exp] = false; is_interact[p_exp] = false;
5433
0
                                          term_map[p_exp] = j;
5434
0
                                          p_exp++;
5435                                      }
5436                                  } else {
5437
27
                                      exp_terms[p_exp] = savepv(uniq_terms[j]);
5438
27
                                      parent_term[p_exp] = savepv(uniq_terms[j]);
5439
27
                                      is_dummy[p_exp] = false; is_interact[p_exp] = false;
5440
27
                                      term_map[p_exp] = j;
5441
27
                                      p_exp++;
5442                                  }
5443                        }
5444                }
5445
21
                X_mat = (double**)safemalloc(n * sizeof(double*));
5446
177
                for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double));
5447
21
                Newx(Y, n, double);
5448                // ========================================================================
5449                // PHASE 4: Matrix Construction & Listwise Deletion
5450                // ========================================================================
5451
177
                for (i = 0; i < n; i++) {
5452
156
                        double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5453
156
                        if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5454
156
                        bool row_ok = true;
5455
156
                        double *restrict row_x = (double*)safemalloc(p_exp * sizeof(double));
5456
666
                        for (j = 0; j < p_exp; j++) {
5457
510
                                  if (strcmp(exp_terms[j], "Intercept") == 0) {
5458
156
                                      row_x[j] = 1.0;
5459
354
                                  } else if (is_interact[j]) {
5460
60
                                      row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]];
5461
294
                                  } else if (is_dummy[j]) {
5462
174
                                      char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5463
174
                                      if (str_val) {
5464
174
                                          row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5465
174
                                          Safefree(str_val);
5466
0
                                      } else { row_ok = false; break; }
5467                                  } else {
5468
120
                                      row_x[j] = evaluate_term(data_hoa, row_hashes, i, parent_term[j]);
5469
120
                                      if (isnan(row_x[j])) { row_ok = false; break; }
5470                                  }
5471                        }
5472
156
                        if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5473
5474
156
                        Y[valid_n] = y_val;
5475
666
                        for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j];
5476
156
                        valid_n++;
5477
156
                        Safefree(row_x);
5478
156
                        Safefree(row_names[i]);
5479                }
5480
21
                Safefree(row_names);
5481
21
                if (valid_n <= p_exp) {
5482                        // Full Clean Up
5483
12
                        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5484
12
                        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5485
12
                        for (j = 0; j < p_exp; j++) {
5486
9
                                 Safefree(exp_terms[j]); Safefree(parent_term[j]);
5487
9
                                 if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5488                        }
5489
3
                        Safefree(exp_terms); Safefree(parent_term);
5490
3
                        Safefree(is_dummy); Safefree(is_interact);
5491
3
                        Safefree(dummy_base); Safefree(dummy_level);
5492
3
                        Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5493
9
                        for(i = 0; i < n; i++) Safefree(X_mat[i]);
5494
3
                        Safefree(X_mat); Safefree(Y);
5495
3
                        if (row_hashes) Safefree(row_hashes);
5496
3
                        croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)");
5497                }
5498                // ========================================================================
5499                // PHASE 5: Math & Output Formatting
5500                // ========================================================================
5501
18
                bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool));
5502
18
                size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t));
5503
18
                apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map);
5504                double *restrict term_ss;
5505                int *restrict term_df;
5506
18
                Newxz(term_ss, num_uniq, double);
5507
18
                Newxz(term_df, num_uniq, int);
5508
5509
72
                for (i = 0; i < p_exp; i++) {
5510
54
                        if (strcmp(exp_terms[i], "Intercept") == 0) continue;
5511
36
                        if (aliased_qr[i]) continue;
5512
33
                        int t_idx = term_map[i];
5513
33
                        size_t r_k = rank_map[i];
5514
33
                        term_ss[t_idx] += Y[r_k] * Y[r_k];
5515
33
                        term_df[t_idx] += 1;
5516                }
5517
18
                int rank = 0;
5518
72
                for (i = 0; i < p_exp; i++) {
5519
54
                        if (!aliased_qr[i]) rank++;
5520                }
5521
18
                double rss_prev = 0.0;
5522
117
                for (i = rank; i < valid_n; i++) {
5523
99
                        rss_prev += Y[i] * Y[i];
5524                }
5525
18
                int res_df = valid_n - rank;
5526
18
                double ms_res = (res_df > 0) ? rss_prev / res_df : 0.0;
5527
5528
18
                HV*restrict ret_hash = newHV();
5529
69
                for (j = 0; j < num_uniq; j++) {
5530
51
                        if (strcmp(uniq_terms[j], "Intercept") == 0) continue;
5531
33
                        HV*restrict term_stats = newHV();
5532
33
                        double ss = term_ss[j];
5533
33
                        int df = term_df[j];
5534
33
                        double ms = (df > 0) ? ss / df : 0.0;
5535
5536
33
                        hv_stores(term_stats, "Df", newSViv(df));
5537
33
                        hv_stores(term_stats, "Sum Sq", newSVnv(ss));
5538
33
                        hv_stores(term_stats, "Mean Sq", newSVnv(ms));
5539
63
                        if (ms_res > 0.0 && df > 0) {
5540
30
                                  double f_val = ms / ms_res;
5541
30
                                  hv_stores(term_stats, "F value", newSVnv(f_val));
5542
30
                                  hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (double)df, (double)res_df)));
5543                        } else {
5544
3
                                  hv_stores(term_stats, "F value", newSVnv(NAN));
5545
3
                                  hv_stores(term_stats, "Pr(>F)", newSVnv(NAN));
5546                        }
5547
33
                        hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0);
5548                }
5549
5550
18
                HV*restrict res_stats = newHV();
5551
18
                hv_stores(res_stats, "Df", newSViv(res_df));
5552
18
                hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev));
5553
18
                hv_stores(res_stats, "Mean Sq", newSVnv(ms_res));
5554
18
                hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats));
5555
5556                // Deep Cleanup
5557
69
                for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5558
69
                for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5559
72
                for (j = 0; j < p_exp; j++) {
5560
54
                        Safefree(exp_terms[j]); Safefree(parent_term[j]);
5561
54
                        if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5562                }
5563
18
                Safefree(exp_terms); Safefree(parent_term);
5564
18
                Safefree(is_dummy); Safefree(is_interact);
5565
18
                Safefree(dummy_base); Safefree(dummy_level);
5566
18
                Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5567
18
                Safefree(term_ss); Safefree(term_df);
5568
168
                for (i = 0; i < n; i++) Safefree(X_mat[i]);
5569
18
                Safefree(X_mat); Safefree(Y);
5570
18
                Safefree(aliased_qr); Safefree(rank_map);
5571
18
                if (row_hashes) Safefree(row_hashes);
5572
18
                RETVAL = newRV_noinc((SV*)ret_hash);
5573        }
5574OUTPUT:
5575    RETVAL
5576
5577PROTOTYPES: DISABLE
5578
5579SV* fisher_test(...)
5580CODE:
5581{
5582
18
        if (items < 1) croak("fisher_test requires at least a data reference");
5583
5584
15
        SV*restrict data_ref = ST(0);
5585
15
        double conf_level = 0.95;
5586
15
        const char*restrict alternative = "two.sided";
5587
5588        // Parse named arguments
5589
21
        for (unsigned short int i = 1; i < items; i += 2) {
5590
6
                if (i + 1 >= items) croak("fisher_test: odd number of arguments");
5591
6
                const char*restrict key = SvPV_nolen(ST(i));
5592
6
                SV*restrict val = ST(i + 1);
5593
6
                if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) {
5594
0
                        conf_level = SvNV(val);
5595
6
                } else if (strEQ(key, "alternative")) {
5596
6
                        alternative = SvPV_nolen(val);
5597                }
5598        }
5599
5600
15
        if (!SvROK(data_ref)) croak("fisher_test requires a reference to an Array or Hash");
5601
15
        SV*restrict deref = SvRV(data_ref);
5602
15
        size_t a = 0, b = 0, c = 0, d = 0;
5603        // Extract Data
5604
15
        if (SvTYPE(deref) == SVt_PVAV) {
5605
6
          AV*restrict outer = (AV*)deref;
5606
6
          if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows");
5607
6
          SV**restrict row1_ptr = av_fetch(outer, 0, 0);
5608
6
          SV**restrict row2_ptr = av_fetch(outer, 1, 0);
5609
6
          if (row1_ptr && row2_ptr && SvROK(*row1_ptr) && SvROK(*row2_ptr)) {
5610
6
                   AV*restrict row1 = (AV*)SvRV(*row1_ptr);
5611
6
                   AV*restrict row2 = (AV*)SvRV(*row2_ptr);
5612
6
                   SV**restrict a_ptr = av_fetch(row1, 0, 0);
5613
6
                   SV**restrict b_ptr = av_fetch(row1, 1, 0);
5614
6
                   SV**restrict c_ptr = av_fetch(row2, 0, 0);
5615
6
                   SV**restrict d_ptr = av_fetch(row2, 1, 0);
5616
6
                   a = (a_ptr && SvOK(*a_ptr)) ? SvIV(*a_ptr) : 0;
5617
6
                   b = (b_ptr && SvOK(*b_ptr)) ? SvIV(*b_ptr) : 0;
5618
6
                   c = (c_ptr && SvOK(*c_ptr)) ? SvIV(*c_ptr) : 0;
5619
6
                   d = (d_ptr && SvOK(*d_ptr)) ? SvIV(*d_ptr) : 0;
5620          } else {
5621
0
                  croak("Invalid 2D Array structure");
5622          }
5623
9
        } else if (SvTYPE(deref) == SVt_PVHV) {
5624                // Fixed 2D Hash Logic: Sort keys lexically to enforce structured rows/columns
5625
9
                HV*restrict outer = (HV*)deref;
5626
9
                if (hv_iterinit(outer) != 2) croak("Outer hash must have exactly 2 keys");
5627
9
                HE*restrict he1 = hv_iternext(outer);
5628
9
                HE*restrict he2 = hv_iternext(outer);
5629
9
                if (!he1 || !he2) croak("Invalid outer hash");
5630
9
                const char*restrict k1 = SvPV_nolen(hv_iterkeysv(he1));
5631
9
                const char*restrict k2 = SvPV_nolen(hv_iterkeysv(he2));
5632
9
                HE*restrict row1_he = (strcmp(k1, k2) < 0) ? he1 : he2;
5633
9
                HE*restrict row2_he = (strcmp(k1, k2) < 0) ? he2 : he1;
5634
9
                SV*restrict row1_sv = hv_iterval(outer, row1_he);
5635
9
                SV*restrict row2_sv = hv_iterval(outer, row2_he);
5636
9
                if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV ||
5637
9
                        !SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) {
5638
0
                        croak("Inner elements must be hashes");
5639                }
5640
9
                HV*restrict in1 = (HV*)SvRV(row1_sv);
5641
9
                HV*restrict in2 = (HV*)SvRV(row2_sv);
5642
9
                if (hv_iterinit(in1) != 2 || hv_iterinit(in2) != 2) croak("Inner hashes must have exactly 2 keys");
5643
9
                HE*restrict in1_he1 = hv_iternext(in1);
5644
9
                HE*restrict in1_he2 = hv_iternext(in1);
5645
9
                const char*restrict in1_k1 = SvPV_nolen(hv_iterkeysv(in1_he1));
5646
9
                const char*restrict in1_k2 = SvPV_nolen(hv_iterkeysv(in1_he2));
5647
9
                HE*restrict in1_c1 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he1 : in1_he2;
5648
9
                HE*restrict in1_c2 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he2 : in1_he1;
5649
9
                HE*restrict in2_he1 = hv_iternext(in2);
5650
9
                HE*restrict in2_he2 = hv_iternext(in2);
5651
9
                const char*restrict in2_k1 = SvPV_nolen(hv_iterkeysv(in2_he1));
5652
9
                const char*restrict in2_k2 = SvPV_nolen(hv_iterkeysv(in2_he2));
5653
9
                HE*restrict in2_c1 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he1 : in2_he2;
5654
9
                HE*restrict in2_c2 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he2 : in2_he1;
5655
9
                a = (hv_iterval(in1, in1_c1) && SvOK(hv_iterval(in1, in1_c1))) ? SvIV(hv_iterval(in1, in1_c1)) : 0;
5656
9
                b = (hv_iterval(in1, in1_c2) && SvOK(hv_iterval(in1, in1_c2))) ? SvIV(hv_iterval(in1, in1_c2)) : 0;
5657
9
                c = (hv_iterval(in2, in2_c1) && SvOK(hv_iterval(in2, in2_c1))) ? SvIV(hv_iterval(in2, in2_c1)) : 0;
5658
9
                d = (hv_iterval(in2, in2_c2) && SvOK(hv_iterval(in2, in2_c2))) ? SvIV(hv_iterval(in2, in2_c2)) : 0;
5659        } else {
5660
0
          croak("Input must be a 2D Array or 2D Hash");
5661        }
5662
5663        // Perform Calculations via Helpers
5664
15
        double p_val = exact_p_value(a, b, c, d, alternative);
5665        double mle_or, ci_low, ci_high;
5666
15
        calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high);
5667
5668        // Construct the Return HashRef purely in C
5669
15
        HV*restrict ret_hash = newHV();
5670
15
        hv_stores(ret_hash, "method", newSVpv("Fisher's Exact Test for Count Data", 0));
5671
15
        hv_stores(ret_hash, "alternative", newSVpv(alternative, 0));
5672
15
        AV*restrict ci_array = newAV();
5673
15
        av_push(ci_array, newSVnv(ci_low));
5674
15
        av_push(ci_array, newSVnv(ci_high));
5675
15
        hv_stores(ret_hash, "conf_int", newRV_noinc((SV*)ci_array));
5676
15
        HV*restrict est_hash = newHV();
5677
15
        hv_stores(ret_hash, "estimate", newRV_noinc((SV*)est_hash));
5678
15
        hv_stores(est_hash, "odds ratio", newSVnv(mle_or));
5679
15
        hv_stores(ret_hash, "p_value", newSVnv(p_val));
5680        // Return the HashRef
5681
15
        RETVAL = newRV_noinc((SV*)ret_hash);
5682}
5683OUTPUT:
5684  RETVAL
5685
5686SV* power_t_test(...)
5687CODE:
5688{
5689
21
    SV* sv_n = NULL;
5690
21
    SV* sv_delta = NULL;
5691
21
    SV* sv_sd = NULL;
5692
21
    SV* sv_sig_level = NULL;
5693
21
    SV* sv_power = NULL;
5694
5695
21
    const char* restrict type = "two.sample";
5696
21
    const char* restrict alternative = "two.sided";
5697
21
    bool strict = false;
5698
21
    double tol = pow(2.2204460492503131e-16, 0.25);
5699
5700
21
    if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)");
5701
102
    for (unsigned short int i = 0; i < items; i += 2) {
5702
81
        const char* restrict key = SvPV_nolen(ST(i));
5703
81
        SV* restrict val = ST(i+1);
5704
5705
81
        if      (strEQ(key, "n"))           sv_n = val;
5706
78
        else if (strEQ(key, "delta"))       sv_delta = val;
5707
57
        else if (strEQ(key, "sd"))          sv_sd = val;
5708
36
        else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val;
5709
33
        else if (strEQ(key, "power"))       sv_power = val;
5710
15
        else if (strEQ(key, "type"))        type = SvPV_nolen(val);
5711
6
        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
5712
0
        else if (strEQ(key, "strict"))      strict = SvTRUE(val);
5713
0
        else if (strEQ(key, "tol"))         tol = SvNV(val);
5714
0
        else croak("power_t_test: unknown argument '%s'", key);
5715    }
5716
5717
21
    bool is_null_n = (!sv_n || !SvOK(sv_n));
5718
21
    bool is_null_delta = (!sv_delta || !SvOK(sv_delta));
5719
21
    bool is_null_power = (!sv_power || !SvOK(sv_power));
5720
21
    bool is_null_sd = (sv_sd && !SvOK(sv_sd));
5721
21
    bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level));
5722
5723
21
    int missing_count = 0;
5724
21
    if (is_null_n) missing_count++;
5725
21
    if (is_null_delta) missing_count++;
5726
21
    if (is_null_power) missing_count++;
5727
21
    if (is_null_sd) missing_count++;
5728
21
    if (is_null_sig_level) missing_count++;
5729
5730
21
    if (missing_count != 1) {
5731
0
        croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL");
5732    }
5733
5734
21
    double n = is_null_n ? 0.0 : SvNV(sv_n);
5735
21
    double delta = is_null_delta ? 0.0 : SvNV(sv_delta);
5736
21
    double sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd);
5737
21
    double sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level);
5738
21
    double power = is_null_power ? 0.0 : SvNV(sv_power);
5739
21
    short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2;
5740
21
    short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2;
5741
21
    if (tside == 2 && !is_null_delta) delta = fabs(delta);
5742
21
    if (is_null_power) {
5743
3
        power = p_body(n, delta, sd, sig_level, tsample, tside, strict);
5744
18
    } else if (is_null_n) {
5745
18
        double low = 2.0, high = 1e7;
5746
18
        while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5747
684
        while (high - low > tol) {
5748
666
            double mid = low + (high - low) / 2.0;
5749
666
            if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid;
5750
519
            else high = mid;
5751        }
5752
18
        n = low + (high - low) / 2.0;
5753
0
    } else if (is_null_sd) {
5754
0
        double low = delta * 1e-7, high = delta * 1e7;
5755
0
        while (high - low > tol) {
5756
0
            double mid = low + (high - low) / 2.0;
5757
0
            if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid;
5758
0
            else high = mid;
5759        }
5760
0
        sd = low + (high - low) / 2.0;
5761
0
    } else if (is_null_delta) {
5762
0
        double low = sd * 1e-7, high = sd * 1e7;
5763
0
        while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5764
0
        while (high - low > tol) {
5765
0
            double mid = low + (high - low) / 2.0;
5766
0
            if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid;
5767
0
            else high = mid;
5768        }
5769
0
        delta = low + (high - low) / 2.0;
5770
0
    } else if (is_null_sig_level) {
5771
0
        double low = 1e-10, high = 1.0 - 1e-10;
5772
0
        while (high - low > tol) {
5773
0
            double mid = low + (high - low) / 2.0;
5774
0
            if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid;
5775
0
            else high = mid;
5776        }
5777
0
        sig_level = low + (high - low) / 2.0;
5778    }
5779
21
    HV*restrict ret = newHV();
5780
21
    hv_stores(ret, "n", newSVnv(n));
5781
21
    hv_stores(ret, "delta", newSVnv(delta));
5782
21
    hv_stores(ret, "sd", newSVnv(sd));
5783
21
    hv_stores(ret, "sig.level", newSVnv(sig_level));
5784
21
    hv_stores(ret, "power", newSVnv(power));
5785
21
    hv_stores(ret, "alternative", newSVpv(alternative, 0));
5786
21
    const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation";
5787
21
    hv_stores(ret, "method", newSVpv(m_str, 0));
5788
21
    const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : "");
5789
21
    if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0));
5790
21
    RETVAL = newRV_noinc((SV*)ret);
5791}
5792OUTPUT:
5793    RETVAL
5794
5795SV* kruskal_test(...)
5796CODE:
5797{
5798
8
        SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL;
5799
8
        unsigned int arg_idx = 0;
5800
5801        // 1. Shift positional arguments
5802        //    Accept either: (arrayref, arrayref) or (hashref)
5803
8
        if (arg_idx < items && SvROK(ST(arg_idx))) {
5804
6
                svtype t = SvTYPE(SvRV(ST(arg_idx)));
5805
6
                if (t == SVt_PVAV) {
5806
3
                        x_sv = ST(arg_idx++);
5807
3
                } else if (t == SVt_PVHV) {
5808
3
                        h_sv = ST(arg_idx++);          /* hash-of-arrays shortcut */
5809                }
5810        }
5811
8
        if (!h_sv && arg_idx < items
5812
5
                  && SvROK(ST(arg_idx))
5813
3
                  && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
5814
3
                g_sv = ST(arg_idx++);
5815        }
5816        // 2. Parse named arguments (fallback)
5817
12
        for (; arg_idx < items; arg_idx += 2) {
5818
4
                const char *restrict key = SvPV_nolen(ST(arg_idx));
5819
4
                SV         *restrict val = ST(arg_idx + 1);
5820
4
                if      (strEQ(key, "x")) x_sv = val;
5821
2
                else if (strEQ(key, "g")) g_sv = val;
5822
0
                else if (strEQ(key, "h")) h_sv = val;
5823
0
                else croak("kruskal_test: unknown argument '%s'", key);
5824        }
5825        // 3. Mutual-exclusion guard
5826
8
        if (h_sv && (x_sv || g_sv))
5827
0
                croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs");
5828        /* ------------------------------------------------------------------ */
5829        /* Shared state filled by whichever input branch runs                  */
5830        /* ------------------------------------------------------------------ */
5831
8
        RankInfo *restrict ri = NULL;
5832
8
        size_t valid_n = 0;
5833
8
        size_t k       = 0;
5834        /* ------------------------------------------------------------------ */
5835        /* 4a. Hash-of-arrays input path                                       */
5836        /*     my %x = ( group1 => [...], group2 => [...], ... )              */
5837        /* ------------------------------------------------------------------ */
5838
8
        if (h_sv) {
5839
3
                if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV)
5840
0
                        croak("kruskal_test: 'h' must be a HASH reference");
5841
3
                HV *restrict h_hv = (HV*)SvRV(h_sv);
5842                // First pass – validate values and tally total elements
5843
3
                size_t total = 0;
5844
3
                hv_iterinit(h_hv);
5845                HE *restrict he;
5846
12
                while ((he = hv_iternext(h_hv))) {
5847
9
                        SV *restrict val = HeVAL(he);
5848
9
                        if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
5849
0
                                croak("kruskal_test: every value in 'h' must be an ARRAY reference");
5850
9
                        total += (size_t)(av_len((AV*)SvRV(val)) + 1);
5851                }
5852
3
                if (total < 2) croak("not enough observations");
5853
5854
3
                ri = (RankInfo *)safemalloc(total * sizeof(RankInfo));
5855
5856                /* Second pass – fill ri[], assigning one group_id per hash key */
5857
3
                size_t group_id = 0;
5858
3
                hv_iterinit(h_hv);
5859
12
                while ((he = hv_iternext(h_hv))) {
5860
9
                        AV    *restrict av  = (AV*)SvRV(HeVAL(he));
5861
9
                        size_t          n_g = (size_t)(av_len(av) + 1);
5862
51
                        for (size_t i = 0; i < n_g; i++) {
5863
42
                                SV **restrict el = av_fetch(av, i, 0);
5864
42
                                if (el && SvOK(*el) && looks_like_number(*el)) {
5865
42
                                        ri[valid_n].val = SvNV(*el);
5866
42
                                        ri[valid_n].idx = group_id;   /* group identity */
5867
42
                                        valid_n++;
5868                                }
5869                        }
5870
9
                        group_id++;
5871                }
5872
3
                k = group_id;   /* number of unique groups = number of hash keys */
5873
5874        /* ------------------------------------------------------------------
5875        /* 4b. Original x / g array-pair input path (unchanged)
5876        /* ------------------------------------------------------------------ */
5877        } else {
5878
5
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
5879
0
                        croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference");
5880
5
                if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV)
5881
0
                        croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference");
5882
5883
5
                AV *restrict x_av = (AV*)SvRV(x_sv);
5884
5
                AV *restrict g_av = (AV*)SvRV(g_sv);
5885
5
                size_t nx = (size_t)(av_len(x_av) + 1);
5886
5
                size_t ng = (size_t)(av_len(g_av) + 1);
5887
5
                if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length");
5888
5
                if (nx < 2)   croak("not enough observations");
5889
5890
5
                ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo));
5891
5892                // Map string group names → contiguous integer IDs
5893
5
                HV    *restrict group_map    = newHV();
5894
5
                size_t          next_group_id = 0;
5895
5896
75
                for (size_t i = 0; i < nx; i++) {
5897
70
                        SV **restrict x_el = av_fetch(x_av, i, 0);
5898
70
                        SV **restrict g_el = av_fetch(g_av, i, 0);
5899
70
                        if (x_el && SvOK(*x_el) && looks_like_number(*x_el)
5900
70
                                 && g_el && SvOK(*g_el)) {
5901
70
                                const char *restrict g_str = SvPV_nolen(*g_el);
5902
70
                                STRLEN                glen  = strlen(g_str);
5903
70
                                SV   **restrict id_sv = hv_fetch(group_map, g_str, glen, 0);
5904                                size_t group_id;
5905
70
                                if (id_sv) {
5906
55
                                        group_id = SvUV(*id_sv);
5907                                } else {
5908
15
                                        group_id = next_group_id++;
5909
15
                                        hv_store(group_map, g_str, glen, newSVuv(group_id), 0);
5910                                }
5911
70
                                ri[valid_n].val = SvNV(*x_el);
5912
70
                                ri[valid_n].idx = group_id;
5913
70
                                valid_n++;
5914                        }
5915                }
5916
5
                k = next_group_id;
5917
5
                SvREFCNT_dec(group_map);
5918        }
5919
5920        /* ------------------------------------------------------------------ */
5921        /* 5. Shared post-extraction validation                                */
5922        /* ------------------------------------------------------------------ */
5923
8
        if (valid_n < 2) { Safefree(ri); croak("not enough observations");            }
5924
8
        if (k       < 2) { Safefree(ri); croak("all observations are in the same group"); }
5925
5926        // 6. Ranking and Tie Accumulation (Reusing LikeR Helper)
5927
8
        bool   has_ties = 0;
5928
8
        double tie_adj  = rank_and_count_ties(ri, valid_n, &has_ties);
5929
5930        // 7. Aggregate Sum of Ranks by Group (tapply(r, g, sum))
5931
8
        double *restrict group_rank_sums = (double *)safecalloc(k, sizeof(double));
5932
8
        size_t *restrict group_counts    = (size_t *)safecalloc(k, sizeof(size_t));
5933
120
        for (size_t i = 0; i < valid_n; i++) {
5934
112
                size_t g_id = ri[i].idx;
5935
112
                group_rank_sums[g_id] += ri[i].rank;
5936
112
                group_counts[g_id]++;
5937        }
5938
5939        // 8. Calculate STATISTIC
5940
8
        double stat_base = 0.0;
5941
32
        for (size_t i = 0; i < k; i++) {
5942
24
                if (group_counts[i] > 0)
5943
24
                        stat_base += (group_rank_sums[i] * group_rank_sums[i])
5944
24
                                     / (double)group_counts[i];
5945        }
5946
8
        Safefree(group_rank_sums); Safefree(group_counts); Safefree(ri);
5947
5948
8
        double n_d  = (double)valid_n;
5949
8
        double stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0);
5950
8
        if (tie_adj > 0.0) {
5951
0
                double tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d));
5952
0
                stat /= tie_denom;
5953        }
5954
8
        int    df    = (int)k - 1;
5955
8
        double p_val = get_p_value(stat, df);
5956
5957        // 9. Return structured data exactly like R's htest
5958
8
        HV *restrict res = newHV();
5959
8
        hv_stores(res, "statistic", newSVnv(stat));
5960
8
        hv_stores(res, "parameter", newSViv(df));
5961
8
        hv_stores(res, "p_value",   newSVnv(p_val));
5962
8
        hv_stores(res, "p.value",   newSVnv(p_val));
5963
8
        hv_stores(res, "method",    newSVpv("Kruskal-Wallis rank sum test", 0));
5964
8
        RETVAL = newRV_noinc((SV*)res);
5965}
5966OUTPUT:
5967        RETVAL