CmdStan Binary Output Format

Alternative to CSV output

Author

Implementation Report

Published

January 20, 2026

Overview

I got Opus to implement a binary output format for CmdStan. Jonah made me do it.

Output Format

The binary format produces two files:

  • {basename}.bin — Raw 64-bit little-endian doubles, row-major order
  • {basename}.meta — Plain text metadata (dimensions, column names, format version); an example appears below
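For illustration, the .meta file for a run of a one-parameter model might look like this (the values are made up; the fields are exactly those emitted by write_metadata() below):

version=1
rows=1000
cols=8
type=float64
endian=little
order=row_major
name.0=lp__
name.1=accept_stat__
name.2=stepsize__
name.3=treedepth__
name.4=n_leapfrog__
name.5=divergent__
name.6=energy__
name.7=theta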

Why this format?

R reads it with readBin(), Python with numpy.fromfile(), and no additional dependencies are needed in CmdStan itself. Two files were easier to implement than one, though there is probably a smart way to merge them; for now, lose the .meta and the .bin is useless. A cool bonus is that partial output is recoverable if Stan is interrupted mid-run.
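To make that concrete, here is a minimal R sketch of the core read (the base name "output" is assumed, and the metadata parsing is deliberately bare-bones; the full reader later in this post does proper validation):

meta <- strsplit(readLines("output.meta"), "=", fixed = TRUE)
meta <- setNames(vapply(meta, `[`, "", 2), vapply(meta, `[`, "", 1))

n_draws <- as.integer(meta[["rows"]])
n_cols  <- as.integer(meta[["cols"]])

# Raw little-endian doubles in row-major order, so reshape with byrow = TRUE
draws <- matrix(
  readBin("output.bin", what = "double", n = n_draws * n_cols,
          size = 8, endian = "little"),
  nrow = n_draws, ncol = n_cols, byrow = TRUE
)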

Usage

./model sample data file=data.json output file=output format=bin

This creates output.bin and output.meta instead of the usual output.csv.
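Running multiple chains works the same way as with CSV output. One pattern (the seed and file names here are illustrative) is to give each chain its own id and base name:

./model sample random seed=12345 id=1 data file=data.json output file=output_1 format=bin
./model sample random seed=12345 id=2 data file=data.json output file=output_2 format=bin
./model sample random seed=12345 id=3 data file=data.json output file=output_3 format=bin
./model sample random seed=12345 id=4 data file=data.json output file=output_4 format=bin

Each run then produces output_N.bin and output_N.meta, which is the naming that read_bin_stan_chains() (defined below) expects.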

Some Implementation Details

You can find the full source in my fork.

Binary Writer (bin_writer.hpp)

The writer class inherits from Stan’s callback interface:

#include <stan/callbacks/writer.hpp>

#include <cstddef>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

class bin_writer : public stan::callbacks::writer {
 private:
  std::ofstream data_stream_;
  std::string base_path_;
  std::vector<std::string> names_;
  std::size_t num_rows_ = 0;
  std::size_t num_cols_ = 0;
  bool header_written_ = false;
  bool finalized_ = false;

 public:
  void operator()(const std::vector<double>& state) override {
    if (state.empty()) {
      return;
    }
    
    data_stream_.write(
        reinterpret_cast<const char*>(state.data()),
        static_cast<std::streamsize>(state.size() * sizeof(double)));
    
    if (!data_stream_) {
      throw std::runtime_error("bin_writer: failed to write sample data");
    }
    
    ++num_rows_;
  }

 private:
  void write_metadata() {
    std::ofstream meta(base_path_ + ".meta", std::ios::trunc);
    if (!meta) {
      throw std::runtime_error("Cannot open metadata file: " 
                               + base_path_ + ".meta");
    }

    meta << "version=1\n";
    
    meta << "rows=" << num_rows_ << "\n";
    meta << "cols=" << num_cols_ << "\n";
    
    meta << "type=float64\n";
    meta << "endian=little\n";
    meta << "order=row_major\n";
    
    for (std::size_t i = 0; i < names_.size(); ++i) {
      meta << "name." << i << "=" << names_[i] << "\n";
    }
    
    if (!meta) {
      throw std::runtime_error("Failed to write metadata file");
    }
  }

  // The constructor and the remaining writer-interface overloads (column names,
  // messages, finalization) are omitted here; see the full source in the fork.
};

CLI Argument (arg_format.hpp)

#ifndef CMDSTAN_ARGUMENTS_ARG_FORMAT_HPP
#define CMDSTAN_ARGUMENTS_ARG_FORMAT_HPP

#include <cmdstan/arguments/singleton_argument.hpp>
#include <string>

namespace cmdstan {

class arg_format : public string_argument {
 public:
  arg_format() : string_argument() {
    _name = "format";
    _description = "Output file format";
    _validity = "csv, bin";
    _default = "csv";
    _default_value = "csv";
    _constrained = true;
    _good_value = "csv";
    _value = _default_value;
  }

  bool is_valid(std::string value) override {
    return value == "csv" || value == "bin";
  }
};

}  // namespace cmdstan

#endif  // CMDSTAN_ARGUMENTS_ARG_FORMAT_HPP

R Reader

Code
#' Read CmdStan Binary Output
#'
#' Reads binary output files produced by CmdStan with format=bin option.
#'
#' @param path Base path without extension (e.g., "output" reads output.bin + output.meta)
#' @return A matrix with named columns containing posterior samples
#' @export
#'
#' @examples
#' draws <- read_bin_stan("output")
#' draws[1:10, c("lp__", "theta.1")]
read_bin_stan <- function(path) {
  meta_path <- paste0(path, ".meta")
  bin_path <- paste0(path, ".bin")

  if (!file.exists(meta_path)) {
    stop("Metadata file not found: ", meta_path, call. = FALSE)
  }
  if (!file.exists(bin_path)) {
    stop("Binary file not found: ", bin_path, call. = FALSE)
  }

  # Parse metadata
  meta_lines <- readLines(meta_path, warn = FALSE)
  meta <- list()

  for (line in meta_lines) {
    if (nchar(trimws(line)) == 0) {
      next
    }
    if (!grepl("=", line, fixed = TRUE)) {
      next
    }

    eq_pos <- regexpr("=", line, fixed = TRUE)[1]
    key <- substr(line, 1, eq_pos - 1)
    value <- substr(line, eq_pos + 1, nchar(line))
    meta[[key]] <- value
  }

  # Validate version
  version <- as.integer(meta[["version"]])
  if (is.na(version)) {
    stop("Invalid metadata: missing version field", call. = FALSE)
  }
  if (version > 1) {
    stop(
      "Unsupported binary format version: ",
      version,
      ". Please update your reader.",
      call. = FALSE
    )
  }

  # Get dimensions
  nrow <- as.integer(meta[["rows"]])
  ncol <- as.integer(meta[["cols"]])

  if (is.na(nrow) || is.na(ncol)) {
    stop("Invalid metadata: missing or invalid rows/cols", call. = FALSE)
  }

  if (nrow == 0) {
    warning("Binary file contains 0 rows")
    mat <- matrix(numeric(0), nrow = 0, ncol = ncol)
    return(mat)
  }

  # Verify endianness (default to little if not specified)
  endian <- meta[["endian"]]
  if (is.null(endian)) {
    endian <- "little"
  }
  if (!endian %in% c("little", "big")) {
    stop("Invalid endian specification: ", endian, call. = FALSE)
  }

  # Read binary data
  expected_size <- nrow * ncol * 8
  actual_size <- file.info(bin_path)$size

  if (actual_size < expected_size) {
    warning(sprintf(
      "Binary file smaller than expected. Expected %d bytes for %d x %d matrix, got %d bytes. File may be truncated.",
      expected_size,
      nrow,
      ncol,
      actual_size
    ))
    # Adjust nrow to what we can actually read
    nrow <- floor(actual_size / (ncol * 8))
  }

  con <- file(bin_path, "rb")
  on.exit(close(con), add = TRUE)

  data <- readBin(
    con,
    what = "double",
    n = nrow * ncol,
    size = 8,
    endian = endian
  )

  if (length(data) != nrow * ncol) {
    stop(
      sprintf(
        "Data size mismatch: expected %d values, got %d",
        nrow * ncol,
        length(data)
      ),
      call. = FALSE
    )
  }

  # Reshape to matrix (R is column-major, data is row-major)
  mat <- matrix(data, nrow = nrow, ncol = ncol, byrow = TRUE)

  # Extract and apply column names
  name_keys <- grep("^name\\.[0-9]+$", names(meta), value = TRUE)

  if (length(name_keys) == ncol) {
    ordered_names <- character(ncol)
    for (key in name_keys) {
      idx <- as.integer(sub("^name\\.", "", key)) + 1L
      if (idx >= 1 && idx <= ncol) {
        ordered_names[idx] <- meta[[key]]
      }
    }
    colnames(mat) <- ordered_names
  } else if (length(name_keys) > 0) {
    warning(sprintf(
      "Column name count mismatch: %d names for %d columns",
      length(name_keys),
      ncol
    ))
  }

  mat
}


#' Read Multiple Chains from CmdStan Binary Output
#'
#' @param base_pattern Base path pattern (e.g., "output" matches output_1, output_2, ...)
#' @param chains Number of chains to read
#' @return List of matrices, one per chain
#' @export
read_bin_stan_chains <- function(base_pattern, chains) {
  result <- vector("list", chains)

  for (i in seq_len(chains)) {
    chain_path <- paste0(base_pattern, "_", i)

    if (!file.exists(paste0(chain_path, ".meta"))) {
      stop("Chain ", i, " not found: ", chain_path, ".meta", call. = FALSE)
    }

    result[[i]] <- read_bin_stan(chain_path)
  }

  names(result) <- paste0("chain_", seq_len(chains))
  result
}
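A usage sketch, assuming four chains were written to output_1 through output_4 as in the Usage section (the parameter name "theta.1" is just an example column):

chains <- read_bin_stan_chains("output", chains = 4)

# Per-chain posterior means of one parameter
sapply(chains, function(m) mean(m[, "theta.1"]))

# Stack all chains into a single draws-by-parameter matrix
all_draws <- do.call(rbind, chains)
dim(all_draws)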


#' Get Metadata from CmdStan Binary Output
#'
#' @param path Base path without extension
#' @return List containing metadata fields
#' @export
read_bin_stan_meta <- function(path) {
  meta_path <- paste0(path, ".meta")

  if (!file.exists(meta_path)) {
    stop("Metadata file not found: ", meta_path, call. = FALSE)
  }

  meta_lines <- readLines(meta_path, warn = FALSE)
  meta <- list()
  names_list <- list()

  for (line in meta_lines) {
    if (nchar(trimws(line)) == 0) {
      next
    }
    if (!grepl("=", line, fixed = TRUE)) {
      next
    }

    eq_pos <- regexpr("=", line, fixed = TRUE)[1]
    key <- substr(line, 1, eq_pos - 1)
    value <- substr(line, eq_pos + 1, nchar(line))

    if (grepl("^name\\.", key)) {
      idx <- as.integer(sub("^name\\.", "", key)) + 1L
      names_list[[idx]] <- value
    } else {
      meta[[key]] <- value
    }
  }

  # Convert numeric fields
  meta$version <- as.integer(meta$version)
  meta$rows <- as.integer(meta$rows)
  meta$cols <- as.integer(meta$cols)
  meta$names <- unlist(names_list)

  meta
}
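This is handy for checking a run without loading any samples; a small sketch (the path "output" is illustrative):

meta <- read_bin_stan_meta("output")
meta$rows         # number of stored draws
meta$cols         # number of output columns
head(meta$names)  # leading column names, e.g. "lp__", "accept_stat__"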

Benchmark

Model (stress_test)

// Simple model for I/O benchmarking - we only care about output size, not sampling
data {
  int<lower=1> N;
}
parameters {
  vector[N] theta;
}
model {
  theta ~ std_normal();
}

Setup

Code
library(tidyverse)
library(ggdist)

# Configuration
NUM_RUNS <- 25
PARAM_SIZES <- c(100, 1000, 10000)
NUM_SAMPLES <- 1000
NUM_WARMUP <- 500
MODEL <- "./stress_test"

# Helper functions
run_model <- function(output_file, format, n_params) {
  data_file <- tempfile(fileext = ".json")
  writeLines(sprintf('{"N": %d}', n_params), data_file)

  args <- c(
    "sample",
    paste0("num_samples=", NUM_SAMPLES),
    paste0("num_warmup=", NUM_WARMUP),
    paste0("data file=", data_file),
    "output",
    paste0("file=", output_file)
  )

  if (format == "bin") {
    args <- c(args, "format=bin")
  }

  system2(MODEL, args, stdout = FALSE, stderr = FALSE)
  unlink(data_file)
}

read_csv_stan <- function(file) {
  read.csv(file, comment.char = "#")
}

Results

Code
results <- tibble()

for (n_params in PARAM_SIZES) {
  for (run in 1:NUM_RUNS) {
    # CSV benchmark
    csv_file <- tempfile(fileext = ".csv")
    csv_write_time <- system.time({
      run_model(csv_file, "csv", n_params)
    })["elapsed"]

    csv_read_time <- system.time({
      csv_data <- read_csv_stan(csv_file)
    })["elapsed"]

    csv_size <- file.size(csv_file)

    # Binary benchmark
    bin_base <- tempfile()
    bin_write_time <- system.time({
      run_model(bin_base, "bin", n_params)
    })["elapsed"]

    bin_read_time <- system.time({
      bin_data <- read_bin_stan(bin_base)
    })["elapsed"]

    bin_size <- file.size(paste0(bin_base, ".bin")) +
                file.size(paste0(bin_base, ".meta"))

    # Record results
    results <- bind_rows(results, tibble(
      params = n_params,
      run = run,
      format = c("CSV", "Binary"),
      write_time = c(csv_write_time, bin_write_time),
      read_time = c(csv_read_time, bin_read_time),
      file_size = c(csv_size, bin_size)
    ))

    # Cleanup
    unlink(csv_file)
    unlink(paste0(bin_base, ".bin"))
    unlink(paste0(bin_base, ".meta"))
  }
}

Write Performance (Including Sampling)

Code
results |>
  ggplot(aes(x = format, y = write_time, fill = format)) +
  stat_halfeye(adjust = 0.5, width = 0.6, .width = 0, justification = -0.2, point_colour = NA) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.5) +
  geom_jitter(width = 0.05, alpha = 0.5, size = 1) +
  facet_wrap(~params, scales = "free_y", labeller = labeller(params = function(x) paste(x, "params"))) +
  labs(
    title = "Write performance (including sampling)",
    subtitle = sprintf("%d samples, %d warmup, %d runs each", NUM_SAMPLES, NUM_WARMUP, NUM_RUNS),
    x = NULL,
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = c("CSV" = "#E69F00", "Binary" = "#56B4E9"))

Read Performance

Code
results |>
  ggplot(aes(x = format, y = read_time, fill = format)) +
  stat_halfeye(adjust = 0.5, width = 0.6, .width = 0, justification = -0.2, point_colour = NA) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.5) +
  geom_jitter(width = 0.05, alpha = 0.5, size = 1) +
  facet_wrap(~params, scales = "free_y", labeller = labeller(params = function(x) paste(x, "params"))) +
  labs(
    title = "Read performance",
    subtitle = "Time to load output into R",
    x = NULL,
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = c("CSV" = "#E69F00", "Binary" = "#56B4E9"))

File Size Comparison

Code
results |>
  summarise(.by = c(params, format), size_mb = mean(file_size) / 1e6) |>
  ggplot(aes(x = format, y = size_mb, fill = format)) +
  geom_col(width = 0.6) +
  facet_wrap(~params, scales = "free_y", labeller = labeller(params = function(x) paste(x, "params"))) +
  labs(
    title = "File size comparison",
    x = NULL,
    y = "File Size (MB)"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_fill_manual(values = c("CSV" = "#E69F00", "Binary" = "#56B4E9"))

Summary

Code
speedup_summary <- results |>
  pivot_wider(
    id_cols = c(params, run),
    names_from = format,
    values_from = c(write_time, read_time, file_size)
  ) |>
  summarise(
    .by = params,
    write_speedup = sprintf("%.2f (%.2f)",
                            mean(write_time_CSV / write_time_Binary),
                            sd(write_time_CSV / write_time_Binary)),
    read_speedup = sprintf("%.1f (%.1f)",
                           mean(read_time_CSV / read_time_Binary),
                           sd(read_time_CSV / read_time_Binary)),
    size_ratio = sprintf("%.2f", mean(file_size_CSV / file_size_Binary))
  )

speedup_summary |>
  knitr::kable(
    col.names = c("Parameters", "Write Speedup", "Read Speedup", "Size Ratio"),
    caption = "Performance comparison: Binary vs CSV. Values shown as mean (sd)."
  )
Performance comparison: Binary vs CSV. Values shown as mean (sd).

Parameters   Write Speedup   Read Speedup   Size Ratio
100          1.66 (0.27)     9.3 (2.3)      1.36
1000         1.44 (0.09)     16.4 (2.5)     1.39
10000        1.32 (0.23)     10.2 (2.1)     1.39

The size ratio is CSV size divided by binary size, so the binary files come out roughly 28% smaller.

Data Integrity Verification

Code
# Run one comparison with same seed
data_file <- tempfile(fileext = ".json")
writeLines('{"N": 100}', data_file)

csv_file <- tempfile(fileext = ".csv")
bin_base <- tempfile()

system2(MODEL, c(
  "sample",
  "num_samples=1000",
  "num_warmup=500",
  "random seed=42",
  paste0("data file=", data_file),
  paste0("output file=", csv_file)
), stdout = FALSE, stderr = FALSE)

system2(MODEL, c(
  "sample",
  "num_samples=1000",
  "num_warmup=500",
  "random seed=42",
  paste0("data file=", data_file),
  "output",
  paste0("file=", bin_base),
  "format=bin"
), stdout = FALSE, stderr = FALSE)

csv_data <- read_csv_stan(csv_file)
bin_data <- read_bin_stan(bin_base)

max_diff <- max(abs(as.matrix(csv_data) - bin_data))

cat(sprintf("Maximum difference between CSV and binary output: %.2e\n", max_diff))
Maximum difference between CSV and binary output: 4.99e-06

The small discrepancy is expected: the CSV writer rounds to six significant figures by default (adjustable via the sig_figs output argument), while the binary file stores the full double-precision values.
Code
cat(sprintf("Dimensions match: %s\n", identical(dim(csv_data), dim(bin_data))))
Dimensions match: TRUE
Code
cat(sprintf("Column names match: %s\n",
            all(colnames(csv_data) == colnames(bin_data))))
Column names match: TRUE
Code
# Scatterplot of relative differences
diff_data <- tibble(
  csv_value = as.vector(as.matrix(csv_data)),
  bin_value = as.vector(bin_data)
) |>
  mutate(
    rel_diff = if_else(csv_value != 0, (csv_value - bin_value) / abs(csv_value), 0)
  )

max_rel_diff <- max(abs(diff_data$rel_diff))

diff_data |>
  sample_n(min(10000, nrow(diff_data))) |>
  ggplot(aes(x = csv_value, y = rel_diff)) +
  geom_point(alpha = 0.3, size = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Relative differences between CSV and binary output",
    subtitle = sprintf("Max relative difference: %.2e", max_rel_diff),
    x = "Value (from CSV)",
    y = "Relative difference (CSV - Binary) / |CSV|"
  ) +
  theme_minimal()

Conclusions

Just do it: end-to-end runs (sampling plus writing) are 1.3–1.7× faster, reads into R are roughly an order of magnitude faster, the files are about 30% smaller, and the draws match the CSV output to within CSV's printing precision. :*