From ea4682c35ac1903ae8f5671399955ef105e1b4aa Mon Sep 17 00:00:00 2001 From: Jiarui Li Date: Mon, 19 Jan 2026 14:41:40 +0800 Subject: [PATCH] Enhance summary output in extract_sequence_lengths script to include min, max, and percentile values for better data insights --- extract_sequence_lengths.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/extract_sequence_lengths.py b/extract_sequence_lengths.py index 6d0ae48..8165656 100644 --- a/extract_sequence_lengths.py +++ b/extract_sequence_lengths.py @@ -63,10 +63,14 @@ def main() -> None: arr = np.asarray(lengths, dtype=np.int64) print(f"Wrote: {out_csv}") - print( - "Summary: " - f"n={arr.size}, min={arr.min()}, p50={int(np.median(arr))}, mean={arr.mean():.2f}, max={arr.max()}" - ) + + percentiles = [5, 10, 25, 50, 75, 90, 95, 99] + pct_values = np.percentile(arr, percentiles) + print("Summary:") + print(f" n={arr.size}") + print(f" min={arr.min()} max={arr.max()} mean={arr.mean():.2f}") + for p, v in zip(percentiles, pct_values): + print(f" p{p:02d}={int(v)}") # Plot histogram plt.figure(figsize=(8, 5))