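Enhance the summary output of the extract_sequence_lengths script: print n, min, max, and mean on separate lines and add percentile values (p05, p10, p25, p50, p75, p90, p95, p99) for better insight into the length distribution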

This commit is contained in:
2026-01-19 14:41:40 +08:00
parent 29913106cb
commit ea4682c35a

View File

@@ -63,10 +63,14 @@ def main() -> None:
arr = np.asarray(lengths, dtype=np.int64)
print(f"Wrote: {out_csv}")
print(
"Summary: "
f"n={arr.size}, min={arr.min()}, p50={int(np.median(arr))}, mean={arr.mean():.2f}, max={arr.max()}"
)
percentiles = [5, 10, 25, 50, 75, 90, 95, 99]
pct_values = np.percentile(arr, percentiles)
print("Summary:")
print(f" n={arr.size}")
print(f" min={arr.min()} max={arr.max()} mean={arr.mean():.2f}")
for p, v in zip(percentiles, pct_values):
print(f" p{p:02d}={int(v)}")
# Plot histogram
plt.figure(figsize=(8, 5))