Objectives

  1. Understand how chunking works in Dask arrays
  2. Explore different ways of specifying and changing chunks
import dask.array as da
import numpy as np

Chunked Dask array

# Build a 6x6 array holding 0..35. No chunk sizes are given here, and the
# output below reports the whole array as a single chunk.
a = da.arange(6*6).reshape((6, 6))
a
                <tr>
                    <th> Bytes </th>
                    <td> 288 B </td>
                    <td> 288 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (6, 6) </td>
                    <td> (6, 6) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 1 chunks in 2 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="170" style="stroke:rgb(0,0,0);stroke-width:1" >

6 6

Array Chunk
# Chunk sizes as a tuple of tuples: one inner tuple per axis.
a.chunks
((6,), (6,))

Rechunk

# Rechunk with a dict of {axis: chunk size}: 2 along axis 0 and 3 along
# axis 1, so the 6x6 array is split into 6 chunks of shape (2, 3).
a = a.rechunk({0: 2, 1: 3})
a
                <tr>
                    <th> Bytes </th>
                    <td> 288 B </td>
                    <td> 48 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (6, 6) </td>
                    <td> (2, 3) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 6 chunks in 3 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="170" style="stroke:rgb(0,0,0);stroke-width:1" >

6 6

Array Chunk
# Boolean-mask indexing: how many elements each chunk keeps depends on the
# data, so the result is 1-D with unknown (nan) shape and chunk sizes.
b = a[a > 2]
b
                <tr>
                    <th> Bytes </th>
                    <td> unknown </td>
                    <td> unknown </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (nan,) </td>
                    <td> (nan,) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 6 chunks in 10 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>

    </td>
</tr>
Array Chunk
# Resolve the unknown chunk sizes of b; the output below then shows a
# concrete shape, (33,), in place of (nan,).
b.compute_chunk_sizes()
                <tr>
                    <th> Bytes </th>
                    <td> 264 B </td>
                    <td> 48 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (33,) </td>
                    <td> (6,) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 6 chunks in 10 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="81" style="stroke:rgb(0,0,0);stroke-width:1" >

33 1

Array Chunk

Reshape

# Rechunk with explicit per-axis chunk sizes: axis 0 is split as (2, 1) and
# axis 1 as (2, 2), giving 4 chunks of unequal shape.
a = da.arange(3*4).reshape((3, 4)).rechunk(((2, 1), (2, 2)))
a
                <tr>
                    <th> Bytes </th>
                    <td> 96 B </td>
                    <td> 32 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (3, 4) </td>
                    <td> (2, 2) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 4 chunks in 3 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="140" style="stroke:rgb(0,0,0);stroke-width:1" >

4 3

Array Chunk
# Evaluate the lazy array to see its actual values.
a.compute()
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
# Reshape (3, 4) -> (3, 2, 2) and evaluate to inspect the values.
a.reshape((3, 2, 2)).compute()
array([[[ 0,  1],
        [ 2,  3]],

       [[ 4,  5],
        [ 6,  7]],

       [[ 8,  9],
        [10, 11]]])
a.reshape(12) # flatten to 1-D; input chunks are merged where needed (3 chunks result, see output below)
                <tr>
                    <th> Bytes </th>
                    <td> 96 B </td>
                    <td> 32 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (12,) </td>
                    <td> (4,) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 3 chunks in 5 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="87" style="stroke:rgb(0,0,0);stroke-width:1" >

12 1

Array Chunk
a.reshape(12, merge_chunks=False) # does not merge chunks; instead the slow-moving axis is rechunked to chunk size 1 (6 chunks result)
                <tr>
                    <th> Bytes </th>
                    <td> 96 B </td>
                    <td> 16 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (12,) </td>
                    <td> (2,) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 6 chunks in 5 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="87" style="stroke:rgb(0,0,0);stroke-width:1" >

12 1

Array Chunk
# Start from (2, 2) chunks, then rechunk axis 0 as (1, 1, 1) so that every
# row is its own chunk; axis 1 stays split as (2, 2).
da.ones((3, 4), chunks=(2, 2)).rechunk(((1, 1, 1), (2, 2)))
                <tr>
                    <th> Bytes </th>
                    <td> 96 B </td>
                    <td> 16 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (3, 4) </td>
                    <td> (1, 2) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 6 chunks in 2 graph layers </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> float64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="170" height="140" style="stroke:rgb(0,0,0);stroke-width:1" >

4 3

Array Chunk
# Wrap an existing NumPy array, giving explicit chunk sizes for each of its
# three axes: (2,), (2, 1) and (2, 2).
da.from_array(np.arange(24).reshape(2, 3, 4), chunks=((2,), (2, 1), (2, 2)))
                <tr>
                    <th> Bytes </th>
                    <td> 192 B </td>
                    <td> 64 B </td>
                </tr>

                <tr>
                    <th> Shape </th>
                    <td> (2, 3, 4) </td>
                    <td> (2, 2, 2) </td>
                </tr>
                <tr>
                    <th> Dask graph </th>
                    <td colspan="2"> 4 chunks in 1 graph layer </td>
                </tr>
                <tr>
                    <th> Data type </th>
                    <td colspan="2"> int64 numpy.ndarray </td>
                </tr>
            </tbody>
        </table>
    </td>
    <td>
    <svg width="215" height="175" style="stroke:rgb(0,0,0);stroke-width:1" >

4 3 2

Array Chunk