Getting your data in and out of vaex¶
Vaex most efficiently reads hdf5 files (column based); however, other datasets may be in different formats. The most flexible way to get data into vaex is to try to open your file with TOPCAT and export it using the colfits format. Although vaex can read these column based fits files fine, the data is stored in big endian format (instead of the now more common little endian), which can give a 30% performance penalty.
Using the command line you can convert a (col) fits file to hdf5 and vice versa
$ vaex convert file gaia-dr1.fits gaia-dr1.hdf5
Batch converting¶
Using TOPCAT, you can convert many files efficiently to a single colfits file from the command line. The following is an example of how to convert the full TGAS dataset into one colfits file:
$ wget -r --no-parent http://cdn.gea.esac.esa.int/Gaia/tgas_source/fits/
$ find cdn.gea.esac.esa.int -iname '*.fits' > tgas.txt;
$ topcat -stilts -Djava.io.tmpdir=/tmp tcat in=@tgas.txt out=tgas.fits ofmt=colfits
$ vaex convert file tgas.fits tgas.hdf5 # optionally convert it to hdf5
From Python¶
Using the following methods, you can convert Pandas dataframes, ascii (whitespace or comma separated) files, or numpy arrays to vaex datasets.
Then use the export_hdf5 method of the dataset to export it to a single hdf5 file, e.g.:
import vaex as vx
import numpy as np
# build two numpy columns: x = 0..99 and y = x squared
x = np.arange(100)
y = x ** 2
# wrap the arrays in a vaex dataset named "test-dataset"
ds = vx.from_arrays("test-dataset", x=x, y=y)
# write the dataset to a single hdf5 file, showing progress while exporting
ds.export_hdf5("/tmp/test.hdf5", progress=True)
Getting your data out¶
In case you have a vaex dataset, and you want to access the underlying data, it is accessible as numpy arrays using the Dataset.columns dictionary, or by converting it to other data structures, see for instance:
Example:
import vaex as vx
import pylab as plt
# open the example dataset provided by vaex
dataset = vx.example()
# make a selection of the rows for which x > -2
dataset.select("x > -2")
# convert the selected rows to a dictionary of arrays, keyed by column name
selected = dataset.to_dict(selection=True)
xs, ys = selected["x"], selected["y"]
plt.scatter(xs, ys)
Producing a hdf5 file¶
You may want to produce an hdf5 file from your favorite language. Below are a few examples of how to convert data into an hdf5 file that vaex can read.
Python example¶
This example script reads in a comma separated values file (example file: helmi200.csv) and outputs it to an hdf5 file that can be read by vaex. Since writing the rows individually is quite slow, the rows are written in batches.
Example file: helmi200.csv
# -*- coding: utf-8 -*-
import h5py
import sys
import numpy
# Converts the comma separated values file given as first command line argument
# into "example.hdf5". The first line of the input should contain the column
# names, all other lines are assumed to contain only float values.
# The script runs unmodified on both Python 2 and Python 3.

# number of rows that are buffered in memory before they are written to disk
Nbatch = 10000

# the context managers make sure both files are closed, also when a line fails to parse
with h5py.File("example.hdf5", "w") as h5file, open(sys.argv[1]) as csv_file:
    # every column is stored as a dataset in the group "data"
    h5columns = h5file.create_group("data")

    # first count the lines, start at -1 since the first line is assumed to contain the column names
    line_count = -1
    for line in csv_file:
        line_count += 1
    print("file contains %d rows" % line_count)

    csv_file.seek(0)  # start from the beginning of the file again
    lines = iter(csv_file)  # explicitly create an iterator over the lines

    # first line should contain the column names
    header = next(lines)
    columns = header.strip().split(",")
    print("columns %s" % (columns,))

    # one on-disk dataset and one in-memory batch buffer per column
    h5_datasets = []
    numpy_arrays = []
    for column_name in columns:
        dataset = h5columns.create_dataset(column_name, (line_count, ), dtype='f8')
        h5_datasets.append(dataset)
        numpy_arrays.append(numpy.zeros((Nbatch, ), dtype='f8'))

    # we read in Nbatch lines at a time, and then write them out
    row = 0  # number of rows stored so far
    for line in lines:
        # convert line to a series of float values
        values = [float(value) for value in line.split(",")]
        index = row % Nbatch  # position of this row in the batch buffers
        for i, array in enumerate(numpy_arrays):
            array[index] = values[i]
        row += 1
        # flush as soon as the buffers are full, *before* the next row
        # overwrites the first slot of the buffers
        if (row % Nbatch) == 0:
            print("at %d of %d" % (row, line_count))
            for dataset, array in zip(h5_datasets, numpy_arrays):
                dataset[row - Nbatch:row] = array[:]

    # rows left in the buffers that did not fill a complete batch
    remainder = row % Nbatch
    if remainder > 0:
        print("writing out last part")
        for dataset, array in zip(h5_datasets, numpy_arrays):
            dataset[row - remainder:row] = array[:remainder]
IDL example¶
; Converts an ascii file with three columns (E, L and Lz) into an hdf5 file,
; storing every column as a dataset of doubles in the group 'data'.
PRINT, 'convert ascii file to hdf5'
testfile = '/Users/users/breddels/vaex/src/SubspaceFinding/data/helmi2000.asc'
h5file_id = H5F_CREATE('/tmp/test.hdf5')
N = 3300000; nr of rows
h5group_columns = H5G_CREATE(h5file_id, "data") ; all columns are stored in the group 'data'
h5type_id = H5T_IDL_CREATE(1.0d) ; create double datatype
h5data_id = H5S_CREATE_SIMPLE(N)
h5_E = H5D_CREATE(h5group_columns, 'E', h5type_id, h5data_id)
h5_L = H5D_CREATE(h5group_columns, 'L', h5type_id, h5data_id)
h5_Lz = H5D_CREATE(h5group_columns, 'Lz', h5type_id, h5data_id)
dataspace = H5D_GET_SPACE(h5_E)
; dataspace describing the single value that is written from memory; it is the
; same for every row, so create it once instead of creating (and never closing)
; a new one for each row
memory_space_id = H5S_CREATE_SIMPLE([1])
; READF reads undefined variables as single precision floats, so define the
; variables as doubles to keep the full precision of the input
E = 0.0d
L = 0.0d
Lz = 0.0d
FREE_LUN, 1
OPENR, 1, testfile
index = 0L
WHILE NOT EOF(1) DO BEGIN
  READF, 1, E,L,Lz
  if (index MOD 100000) EQ 0 then begin
    print, index, ' of',N
  end
  ; select the element at position index in the file, the row is written there
  H5S_SELECT_HYPERSLAB, dataspace, [index], [1], stride=[1], /RESET
  H5D_WRITE, h5_E, [E], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_L, [L], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_Lz, [Lz], MEMORY_SPACE_ID=memory_space_id, FILE_SPACE_ID=dataspace
  index = index + 1
ENDWHILE
; release all hdf5 identifiers before closing the file
H5S_CLOSE, memory_space_id
H5S_CLOSE, dataspace
H5D_CLOSE, h5_E
H5D_CLOSE, h5_L
H5D_CLOSE, h5_Lz
H5S_CLOSE, h5data_id
H5T_CLOSE, h5type_id
H5G_CLOSE, h5group_columns
H5F_CLOSE, h5file_id
FREE_LUN, 1
end
C example¶
/*
compile as: gcc -Wall -std=c99 -o ascii_to_hdf5 ascii_to_hdf5.c -lhdf5
run as: ./ascii_to_hdf5 example.hdf5 ../../data/helmi2000-header.asc 3300000 3
arguments are: output filename, input filename, rows, columns
*/
#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
/* upper limit on the number of columns; sizes column_names and the offsets array in main */
#define MAX_COLUMNS 512
/* column names as read from the first line of the input file, each name is
   stored in a 512 byte buffer (including the terminating NUL) */
char column_names[MAX_COLUMNS][512];
/*
 * Abort the program with an error message when a failure condition holds.
 *
 * test:    non-zero means failure; zero makes this function a no-op
 * message: printf-style format string, followed by its arguments
 *
 * On failure the formatted message and a newline are written to stderr and
 * the process exits with EXIT_FAILURE; the function does not return then.
 */
static void
check (int test, const char * message, ...)
{
    va_list ap;

    /* nothing went wrong, carry on */
    if (!test)
        return;

    va_start (ap, message);
    vfprintf (stderr, message, ap);
    va_end (ap);
    fputs ("\n", stderr);
    exit (EXIT_FAILURE);
}
int main(int argc, char *argv[])
{
hid_t file; /* Handles */
herr_t status;
haddr_t offsets[MAX_COLUMNS];
hsize_t dims[1];
char* filename_output = argv[1];
char* filename_input = argv[2];
FILE* file_input = fopen(filename_input, "r");
int no_rows = atoi(argv[3]);
int no_columns = atoi(argv[4]);
dims[0] = no_rows;
// create the file and the group 'columns', which vaex will expect
file = H5Fcreate(filename_output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
hid_t group = H5Gcreate1(file, "data", 0);
// find the column names in the first line
for(int i=0; i<no_columns; i++) {
fscanf(file_input," %s", column_names[i]);
printf("column[%d]: %s\n", i, column_names[i]);
}
fscanf(file_input," \n");
// just create the dataspace using the HDF5 library, and ask for the offset from the beginning of the file
for(int i = 0; i < no_columns; i++) {
hid_t space = H5Screate_simple(1, dims, NULL);
hid_t dcpl = H5Pcreate (H5P_DATASET_CREATE);
H5Pset_layout (dcpl, H5D_CONTIGUOUS); // compact allows us the memory map the file
H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_EARLY); // need this to allocate the space so offset exists
hid_t dset = H5Dcreate(group, column_names[i], H5T_IEEE_F64LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
offsets[i] = H5Dget_offset(dset);
H5D_space_status_t space_status;
H5Dget_space_status(dset, &space_status);
printf("offset[%d] = %x allocated: %s\n", i, (unsigned int)offsets[i], (space_status == H5D_SPACE_STATUS_ALLOCATED ? "yes" : "no"));
status = H5Dclose (dset);
status = H5Pclose (dcpl);
status = H5Sclose (space);
}
//close the group and file
H5Gclose(group);
status = H5Fclose (file);
// now we can simpy memory map the file (meaning we tread the file as one big 'array'
// the offsets will tell us where we can write the columns
struct stat s;
status = stat(filename_output, &s);
check (status < 0, "stat %s failed: %s", filename_output, strerror (errno));
printf("file size: %lld\n", (unsigned long long)s.st_size);
int fd = open(filename_output, O_RDWR);
check (fd < 0, "open %s failed: %s", filename_output, strerror (errno));
// the mapped pointer points to the beginning of the file
char* mapped = mmap (0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
check (mapped == MAP_FAILED, "mmap %s failed: %s",
filename_output, strerror (errno));
// read in the rows, and directly write them to the file
for(int j=0; j<no_rows; j++) {
for(int i=0; i<no_columns; i++) {
double* column_ptr = (double*)(mapped+offsets[i]);
fscanf(file_input," %lf", &column_ptr[j]);
}
if( ((j % 100000) == 0) & (j > 0) )
printf("%d of %d\n", j, no_rows);
}
printf("done!\n");
close(fd);
}