Dask

If you want to try out this notebook with a live Python kernel, use mybinder:

https://mybinder.org/badge_logo.svg

Dask.array

A vaex dataframe can be lazily converted to a dask.array using DataFrame.to_dask_array.

[2]:
import vaex
df = vaex.example()
df
[2]:
# x y z vx vy vz E L Lz FeH
0 -0.7774707672.10626292 1.93743467 53.276722 288.386047 -95.2649078-121238.171875 831.0799560546875 -336.426513671875 -2.309227609164518
1 3.77427316 2.23387194 3.76209331 252.810791 -69.9498444-56.3121033-100819.91406251435.1839599609375-828.7567749023438 -1.788735491591229
2 1.3757627 -6.3283844 2.63250017 96.276474 226.440201 -34.7527161-100559.96093751039.2989501953125920.802490234375 -0.7618109022478798
3 -7.06737804 1.31737781 -6.10543537 204.968842 -205.679016-58.9777031-70174.8515625 2441.724853515625 1183.5899658203125 -1.5208778422936413
4 0.243441463 -0.822781682-0.206593871-311.742371-238.41217 186.824127 -144138.75 374.8164367675781 -314.5353088378906 -2.655341358427361
... ... ... ... ... ... ... ... ... ... ...
329,9953.76883793 4.66251659 -4.42904139 107.432999 -2.1377129617.5130272 -119687.3203125746.8833618164062 -508.96484375 -1.6499842518381402
329,9969.17409325 -8.87091351 -8.61707687 32.0 108.089264 179.060638 -68933.8046875 2395.633056640625 1275.490234375 -1.4336036247720836
329,997-1.14041007 -8.4957695 2.25749826 8.46711349 -38.2765236-127.541473-112580.359375 1182.436279296875 115.58557891845703 -1.9306227597361942
329,998-14.2985935 -5.51750422 -8.65472317 110.221558 -31.392559186.2726822 -74862.90625 1324.59265136718751057.017333984375 -1.225019818838568
329,99910.5450506 -8.86106777 -4.65835428 -2.10541415-27.61088563.80799961 -95361.765625 351.0955505371094 -309.81439208984375-2.5689636894079477
[10]:
# convert a set of columns in the dataframe to a 2d dask array
A = df[['x', 'y', 'z']].to_dask_array()
A
[10]:
Array Chunk
Bytes 7.92 MB 7.92 MB
Shape (330000, 3) (330000, 3)
Count 2 Tasks 1 Chunks
Type float64 numpy.ndarray
3 330000
[11]:
import dask.array as da
# lazily compute with dask
r = da.sqrt(A[:,0]**2 + A[:,1]**2 + A[:,2]**2)
r
[11]:
Array Chunk
Bytes 2.64 MB 2.64 MB
Shape (330000,) (330000,)
Count 11 Tasks 1 Chunks
Type float64 numpy.ndarray
330000 1
[12]:
# materialize the data
r_computed = r.compute()
r_computed
[15]:
# put it back in the dataframe
df['r'] = r_computed
df
[15]:
# x y z vx vy vz E L Lz FeH r
0 -0.7774707672.10626292 1.93743467 53.276722 288.386047 -95.2649078-121238.171875 831.0799560546875 -336.426513671875 -2.309227609164518 2.9655450396553587
1 3.77427316 2.23387194 3.76209331 252.810791 -69.9498444-56.3121033-100819.91406251435.1839599609375-828.7567749023438 -1.788735491591229 5.77829281049018
2 1.3757627 -6.3283844 2.63250017 96.276474 226.440201 -34.7527161-100559.96093751039.2989501953125920.802490234375 -0.76181090224787986.99079603950256
3 -7.06737804 1.31737781 -6.10543537 204.968842 -205.679016-58.9777031-70174.8515625 2441.724853515625 1183.5899658203125 -1.52087784229364139.431842752707537
4 0.243441463 -0.822781682-0.206593871-311.742371-238.41217 186.824127 -144138.75 374.8164367675781 -314.5353088378906 -2.655341358427361 0.8825613121347967
... ... ... ... ... ... ... ... ... ... ... ...
329,9953.76883793 4.66251659 -4.42904139 107.432999 -2.1377129617.5130272 -119687.3203125746.8833618164062 -508.96484375 -1.64998425183814027.453831761514681
329,9969.17409325 -8.87091351 -8.61707687 32.0 108.089264 179.060638 -68933.8046875 2395.633056640625 1275.490234375 -1.433603624772083615.398412491068198
329,997-1.14041007 -8.4957695 2.25749826 8.46711349 -38.2765236-127.541473-112580.359375 1182.436279296875 115.58557891845703 -1.93062275973619428.864250273925633
329,998-14.2985935 -5.51750422 -8.65472317 110.221558 -31.392559186.2726822 -74862.90625 1324.59265136718751057.017333984375 -1.225019818838568 17.601047186042507
329,99910.5450506 -8.86106777 -4.65835428 -2.10541415-27.61088563.80799961 -95361.765625 351.0955505371094 -309.81439208984375-2.568963689407947714.540181524970293
[ ]: