Getting your data in and out of vaex

Vaex most efficiently reads hdf5 files (column based), however other datasets may be in different formats. The most flexible way to get data into vaex is to try to open your file with TOPCAT and export it using the colfits format. Vaex can read these column based fits files fine, but the data is stored in big endian format (instead of the now more common little endian), which can give a 30% performance penalty.

Using the command line you can convert a (col) fits file to hdf5 and vice versa:

$ vaex convert file gaia-dr1.fits gaia-dr1.hdf5

Batch converting

Using TOPCAT, you can efficiently convert many files to a single colfits file from the command line. The following is an example of how to convert the full TGAS dataset into one colfits file:

$ wget -r --no-parent http://cdn.gea.esac.esa.int/Gaia/tgas_source/fits/
$ find cdn.gea.esac.esa.int -iname '*.fits' > tgas.txt;
$ topcat -stilts -Djava.io.tmpdir=/tmp tcat in=@tgas.txt out=tgas.fits ofmt=colfits
$ vaex convert file tgas.fits tgas.hdf5 # optionally convert it to hdf5

From Python

Using the following methods, you can convert Pandas dataframes, ascii (whitespace or comma separated) files, or numpy arrays to vaex datasets.

Then use the dataset's export_hdf5 method to export it to a single hdf5 file, e.g.:

import vaex as vx
import numpy as np

# build a dataset from two in-memory numpy arrays (the integers 0..99 and their squares)
numbers = np.arange(100)
squares = numbers**2
dataset = vx.from_arrays("test-dataset", x=numbers, y=squares)

# write the dataset to a single hdf5 file
dataset.export_hdf5("/tmp/test.hdf5", progress=True)

Getting your data out

If you have a vaex dataset and you want to access the underlying data, the columns are accessible as numpy arrays using the Dataset.columns dictionary, or by converting them to other data structures, see for instance:

Example:

import vaex as vx
import pylab as plt

# open the example dataset and make a selection on it
dataset = vx.example()
dataset.select("x > -2")

# convert the dataset to a dict, passing selection=True, and scatter plot two of its columns
selected = dataset.to_dict(selection=True)
xs = selected["x"]
ys = selected["y"]
plt.scatter(xs, ys)

Producing a hdf5 file

You may want to produce an hdf5 file from your favorite language; below are a few examples of how to convert data into an hdf5 file that vaex can read.

Python example

This example script reads in a comma separated values file (Example file: helmi200.csv) and outputs it to an hdf5 file that can be read by vaex. Since writing the rows individually is quite slow, the rows are written in batches.

Example file: helmi200.csv

# -*- coding: utf-8 -*-
"""Convert a comma separated values file into an hdf5 file.

Usage: python <this script> input.csv

The first line of the csv file should contain the column names, all other
values are assumed to be floats. Every column is stored as a float64 dataset
in the "data" group of example.hdf5. Since writing the rows individually is
quite slow, the rows are buffered and written out in batches of Nbatch rows.
Blank lines in the input are ignored.
"""
import h5py
import sys
import numpy

Nbatch = 10000  # number of rows buffered in memory before they are written to disk

# the context managers make sure both files are closed (and the hdf5 file is
# flushed to disk), also when an error occurs halfway
with open(sys.argv[1]) as csv_file, h5py.File("example.hdf5", "w") as h5file:
    h5columns = h5file.create_group("data")  # every column becomes a dataset in this group

    # first count the (non blank) lines, subtract 1 since the first line is
    # assumed to contain the column names
    line_count = sum(1 for line in csv_file if line.strip()) - 1
    if line_count < 0:
        sys.exit("%s is empty, expected a first line with the column names" % sys.argv[1])
    print("file contains %d rows" % line_count)

    csv_file.seek(0)  # start from the beginning of the file again
    lines = (line for line in csv_file if line.strip())  # iterator over the non blank lines

    # first line should contain the column names
    header = next(lines)
    columns = [name.strip() for name in header.split(",")]
    print("columns %s" % (columns,))

    # assume all values are floats; each column gets a dataset on disk and a
    # buffer in memory that holds one batch of rows
    h5_datasets = [h5columns.create_dataset(name, (line_count,), dtype='f8') for name in columns]
    numpy_arrays = [numpy.zeros((Nbatch,), dtype='f8') for name in columns]

    row = 0  # number of rows stored in the buffers or on disk so far
    # we read in Nbatch lines at a time, and then write them out
    for line in lines:
        # convert line to a series of float values
        values = [float(value) for value in line.split(",")]
        if len(values) != len(columns):
            raise ValueError("row %d contains %d values, expected %d"
                             % (row, len(values), len(columns)))
        index = row % Nbatch  # position of this row in the current batch
        for array, value in zip(numpy_arrays, values):
            array[index] = value
        row += 1
        # only write after the row is counted, such that the buffers contain
        # exactly the rows [row - Nbatch, row) when they are written to disk
        if row % Nbatch == 0:
            print("at %d of %d" % (row, line_count))
            start = row - Nbatch
            for dataset, array in zip(h5_datasets, numpy_arrays):
                dataset[start:row] = array

    # the last batch is only partly filled (or empty when the number of rows
    # is an exact multiple of Nbatch)
    remaining = row % Nbatch
    if remaining > 0:
        print("writing out last part")
        start = row - remaining
        for dataset, array in zip(h5_datasets, numpy_arrays):
            dataset[start:row] = array[:remaining]

IDL example

; Convert an ascii table with three columns (E, L and Lz) into an hdf5 file,
; each column is stored as a dataset of doubles in the group "data".
PRINT, 'convert ascii file to hdf5'
testfile = '/Users/users/breddels/vaex/src/SubspaceFinding/data/helmi2000.asc'
h5file_id = H5F_CREATE('/tmp/test.hdf5')

N = 3300000 ; nr of rows, the datasets are created with exactly this length

h5group_columns = H5G_CREATE(h5file_id, "data") ; all columns are grouped under "data"
h5type_id = H5T_IDL_CREATE(1.0d) ; create double datatype
h5data_id = H5S_CREATE_SIMPLE(N)

h5_E = H5D_CREATE(h5group_columns, 'E', h5type_id, h5data_id)
h5_L = H5D_CREATE(h5group_columns, 'L', h5type_id, h5data_id)
h5_Lz = H5D_CREATE(h5group_columns, 'Lz', h5type_id, h5data_id)

; file dataspace in which the element to write is selected for each row, it is
; shared by the three datasets since they all have the same shape
dataspace = H5D_GET_SPACE(h5_E)
; memory dataspace describing a single element, created once and reused for
; every row instead of creating (and leaking) a new one for each row
memory_space_id = H5S_CREATE_SIMPLE([1])

; define the variables as doubles before reading, otherwise READF creates
; them as single precision floats and precision is lost
E = 0d
L = 0d
Lz = 0d

; let IDL pick a free logical unit instead of hard-coding unit 1
OPENR, lun, testfile, /GET_LUN

index = 0L
; NOTE: the input should not contain more than N rows, since the datasets
; have a fixed length of N
WHILE NOT EOF(lun) DO BEGIN
  READF, lun, E, L, Lz
  IF (index MOD 100000) EQ 0 THEN BEGIN
    PRINT, index, ' of', N
  ENDIF
  ; select element 'index' in the file, and write one value of each column to it
  H5S_SELECT_HYPERSLAB, dataspace, [index], [1], stride=[1], /RESET
  H5D_WRITE, h5_E, [E], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_L, [L], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_Lz, [Lz], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  index = index + 1
ENDWHILE

FREE_LUN, lun ; closes the input file and releases the logical unit

; release all hdf5 handles before closing the file
H5S_CLOSE, memory_space_id
H5S_CLOSE, dataspace
H5D_CLOSE, h5_E
H5D_CLOSE, h5_L
H5D_CLOSE, h5_Lz
H5S_CLOSE, h5data_id
H5T_CLOSE, h5type_id
H5G_CLOSE, h5group_columns
H5F_CLOSE, h5file_id

end

C example

/*
compile as: gcc -Wall -std=c99 -o ascii_to_hdf5 ascii_to_hdf5.c -lhdf5
run as: ./ascii_to_hdf5 example.hdf5 ../../data/helmi2000-header.asc 3300000 3
	arguments are: output filename, input filename, rows, columns

*/
#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h> 
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#define MAX_COLUMNS 512

char column_names[MAX_COLUMNS][512];


/*
 * Error helper: when `test` is non-zero, print the printf-style `message`
 * (followed by a newline) to stderr and terminate the program with
 * EXIT_FAILURE. When `test` is zero nothing happens.
 */
static void
check (int test, const char * message, ...)
{
	va_list ap;

	/* nothing went wrong, carry on */
	if (!test)
		return;

	va_start (ap, message);
	vfprintf (stderr, message, ap);
	va_end (ap);
	fputc ('\n', stderr);
	exit (EXIT_FAILURE);
}

int main(int argc, char *argv[])
{
	hid_t		file;    /* Handles */
	herr_t		status;
	haddr_t		offsets[MAX_COLUMNS];
	hsize_t		dims[1];
	
	char* filename_output = argv[1];
	char* filename_input = argv[2];
	FILE* file_input = fopen(filename_input, "r");

	int no_rows = atoi(argv[3]);
	int no_columns = atoi(argv[4]);
	dims[0] = no_rows;
				

	
	// create the file and the group 'columns', which vaex will expect
	file = H5Fcreate(filename_output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
	hid_t group = H5Gcreate1(file, "data", 0);
	
	// find the column names in the first line
	for(int i=0; i<no_columns; i++) {
		fscanf(file_input," %s", column_names[i]);
		printf("column[%d]: %s\n", i, column_names[i]);
	}
	fscanf(file_input," \n");

	// just create the dataspace using the HDF5 library, and ask for the offset from the beginning of the file
	for(int i = 0; i < no_columns; i++)  {
		hid_t space = H5Screate_simple(1, dims, NULL);

		
		hid_t dcpl = H5Pcreate (H5P_DATASET_CREATE);
		H5Pset_layout (dcpl, H5D_CONTIGUOUS); // compact allows us the memory map the file
		H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_EARLY); // need this to allocate the space so offset exists
		hid_t dset = H5Dcreate(group, column_names[i], H5T_IEEE_F64LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
		
		offsets[i] = H5Dget_offset(dset);
		
		H5D_space_status_t space_status;
		H5Dget_space_status(dset, &space_status);
		printf("offset[%d] = %x allocated: %s\n", i, (unsigned int)offsets[i], (space_status == H5D_SPACE_STATUS_ALLOCATED ? "yes" : "no"));

		status = H5Dclose (dset);
		status = H5Pclose (dcpl);
		status = H5Sclose (space);
	}
	//close the group and file
	H5Gclose(group);
	status = H5Fclose (file);
	
	
	// now we can simpy memory map the file (meaning we tread the file as one big 'array'
	// the offsets will tell us where we can write the columns
	
	struct stat s;
	status = stat(filename_output,  &s);
	check (status < 0, "stat %s failed: %s", filename_output, strerror (errno));
	printf("file size: %lld\n", (unsigned long long)s.st_size);
	
	int fd = open(filename_output, O_RDWR);
	check (fd < 0, "open %s failed: %s", filename_output, strerror (errno));
	
    
	// the mapped pointer points to the beginning of the file
	char* mapped = mmap (0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	check (mapped == MAP_FAILED, "mmap %s failed: %s",
           filename_output, strerror (errno));

	// read in the rows, and directly write them to the file
	for(int j=0; j<no_rows; j++) {
		for(int i=0; i<no_columns; i++) {
			double* column_ptr = (double*)(mapped+offsets[i]);
			fscanf(file_input," %lf", &column_ptr[j]);
		}
		if( ((j % 100000) == 0) & (j > 0) )
			printf("%d of %d\n", j, no_rows);
	}
	printf("done!\n");
	close(fd);
}