diff --git a/docs/gen_docs.py b/docs/gen_docs.py index 2071749..4a824e8 100644 --- a/docs/gen_docs.py +++ b/docs/gen_docs.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import sys import os +from markdown_toclify import markdown_toclify root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) src_dir = root_dir + "/include/tim/vx/ops" @@ -73,4 +73,7 @@ for index, line in enumerate(lines): with open(root_dir + md_file, mode='w',newline='\n', encoding='UTF-8') as fhndl: fhndl.writelines(new_lines) -print(root_dir) +cont = markdown_toclify(input_file=root_dir + md_file) + +with open(root_dir + md_file, mode='w',newline='\n', encoding='UTF-8') as fhndl: + fhndl.write(cont) diff --git a/include/tim/vx/ops/activations.h b/include/tim/vx/ops/activations.h index b1ad90b..e8f8f44 100644 --- a/include/tim/vx/ops/activations.h +++ b/include/tim/vx/ops/activations.h @@ -58,7 +58,7 @@ namespace ops { * LeakyRelu(x) : alpha * x if x <= 0; x if x > 0. alpha is a scalar. * * Prelu(x) : alpha * x if x <= 0; x if x > 0. alpha is a tensor. - * - axis : Describes the axis of the inputs when coerced to 2D. + * - axis : describes the axis of the inputs when coerced to 2D. * ``` */ diff --git a/include/tim/vx/ops/batch2space.h b/include/tim/vx/ops/batch2space.h index 8974301..576420e 100644 --- a/include/tim/vx/ops/batch2space.h +++ b/include/tim/vx/ops/batch2space.h @@ -38,7 +38,7 @@ namespace ops { * This operation reshapes the batch dimension (dimension 0) into M + 1 dimensions * of shape **block_size** + [batch], interleaves these blocks back into the grid * defined by the spatial dimensions [1, ..., M], to obtain a result with the same - * rank as the input. + * rank as the input. This is the reverse transformation of Space2Batch. * * - crop : crop the output tensor for ROI usage. 
*/ diff --git a/include/tim/vx/ops/concat.h b/include/tim/vx/ops/concat.h index db947a6..2ed5dcd 100644 --- a/include/tim/vx/ops/concat.h +++ b/include/tim/vx/ops/concat.h @@ -33,6 +33,7 @@ namespace ops { * ## Concat * * Concatenate a list of tensors into a single tensor. + * * - axis : Which axis to concat on. */ diff --git a/include/tim/vx/ops/conv2d.h b/include/tim/vx/ops/conv2d.h index d073445..fb5dec5 100644 --- a/include/tim/vx/ops/conv2d.h +++ b/include/tim/vx/ops/conv2d.h @@ -38,7 +38,7 @@ namespace ops { * Performs a 2-D convolution operation, include classic Conv2D / * Depthwise Conv2D / Group Conv2D / Dilation Conv2D. * - * - weights : the channel number for weight tensor. + * - weights : the output channel number for weight tensor. * - ksize : the height and width for weight tensor. * - padding : AUTO, VALID or SAME. * - pad : pad value for each spatial axis. diff --git a/include/tim/vx/ops/depth2space.h b/include/tim/vx/ops/depth2space.h index be2a6d1..fc5e8b4 100644 --- a/include/tim/vx/ops/depth2space.h +++ b/include/tim/vx/ops/depth2space.h @@ -29,6 +29,22 @@ namespace tim { namespace vx { namespace ops { +/** + * ## DepthToSpace + * + * DepthToSpace rearranges (permutes) data from depth into blocks of spatial data. + * This is the reverse transformation of SpaceToDepth. + * + * Chunks of data of size block_size * block_size from depth are rearranged into + * non-overlapping blocks of size block_size x block_size. + * + * The width of the output tensor is input_width * block_size, whereas the height + * is input_height * block_size. The depth of the input tensor must be divisible + * by block_size * block_size. + * + * - crop : crop the output tensor for ROI usage. 
+ */ + class DepthToSpace : public Operation { public: DepthToSpace(Graph* Graph, int block_size, diff --git a/include/tim/vx/ops/dropout.h b/include/tim/vx/ops/dropout.h index 9e84ef1..b4e4d3a 100644 --- a/include/tim/vx/ops/dropout.h +++ b/include/tim/vx/ops/dropout.h @@ -30,6 +30,15 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Dropout + * + * The Dropout layer randomly sets input units to 0 with a frequency of rate at + * each step during training time, which helps prevent overfitting. + * + * TIM-VX only focuses on inference time, and just scales the input tensor by **ratio** + * for the Dropout operator. + */ class Dropout : public Operation { public: diff --git a/include/tim/vx/ops/elementwise.h b/include/tim/vx/ops/elementwise.h index 17a8cde..2dbe7ad 100644 --- a/include/tim/vx/ops/elementwise.h +++ b/include/tim/vx/ops/elementwise.h @@ -29,6 +29,39 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Add + * + * Add(x, y) : x + y. This operation supports broadcasting. + * + * ## Sub + * + * Sub(x, y) : x - y. This operation supports broadcasting. + * + * ## Multiply + * + * Multiply(x, y) : Multiplies two tensors, element-wise, also known as Hadamard + * product. This operation supports broadcasting. + * + * - scale: scaling the product. + * + * ## Div + * + * Div(x, y) : x / y. This operation supports broadcasting. + * + * ## Pow + * + * Pow(x, y) : x ^ y. This operation supports broadcasting. + * + * ## Minimum + * + * Minimum(x, y) : min(x, y). This operation supports broadcasting. + * + * ## Maximum + * + * Maximum(x, y) : max(x, y). This operation supports broadcasting. 
+ */ + + #define DECLARE_ELEMENTWISE_OP(NAME) \ class NAME : public Operation { \ public: \ diff --git a/include/tim/vx/ops/fullyconnected.h b/include/tim/vx/ops/fullyconnected.h index 877982a..7bee49f 100644 --- a/include/tim/vx/ops/fullyconnected.h +++ b/include/tim/vx/ops/fullyconnected.h @@ -28,6 +28,17 @@ namespace tim { namespace vx { namespace ops { + +/** + * ## FullyConnected + * + * Denotes a fully (densely) connected layer, which connects all elements in the + * input tensor with each element in the output tensor. + * + * - axis: Describes the axis of the inputs when coerced to 2D. + * - weights: the output channel number for weight tensor. + */ + class FullyConnected : public Operation { public: FullyConnected(Graph* graph, uint32_t axis, uint32_t weights); diff --git a/include/tim/vx/ops/gather.h b/include/tim/vx/ops/gather.h index c788075..de294fa 100644 --- a/include/tim/vx/ops/gather.h +++ b/include/tim/vx/ops/gather.h @@ -29,6 +29,12 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Gather + * + * Gather slices from input along **axis** according to **indices**. + */ + class Gather : public Operation { public: Gather(Graph* Graph, int axis); diff --git a/include/tim/vx/ops/gathernd.h b/include/tim/vx/ops/gathernd.h index 0759fe2..f4d92e6 100644 --- a/include/tim/vx/ops/gathernd.h +++ b/include/tim/vx/ops/gathernd.h @@ -29,6 +29,12 @@ namespace tim { namespace vx { namespace ops { +/** + * ## GatherNd + * + * An operation similar to Gather but gathers across multiple axes at once. 
+ */ + class GatherNd : public Operation { public: GatherNd(Graph* Graph); diff --git a/include/tim/vx/ops/l2normalization.h b/include/tim/vx/ops/l2normalization.h index 33fa792..2e1f355 100644 --- a/include/tim/vx/ops/l2normalization.h +++ b/include/tim/vx/ops/l2normalization.h @@ -25,6 +25,18 @@ #define TIM_VX_OPS_L2NOMALIZATION_H_ #include "tim/vx/operation.h" +/** + * ## L2Normalization + * + * Applies L2 normalization along the axis dimension: + * + * ``` + * output[batch, row, col, channel] = + * input[batch, row, col, channel] / + * sqrt(sum_{c} pow(input[batch, row, col, c], 2)) + * ``` + */ + namespace tim { namespace vx { namespace ops { diff --git a/include/tim/vx/ops/localresponsenormalization.h b/include/tim/vx/ops/localresponsenormalization.h index 4465b20..8f31ff0 100644 --- a/include/tim/vx/ops/localresponsenormalization.h +++ b/include/tim/vx/ops/localresponsenormalization.h @@ -25,6 +25,18 @@ #define TIM_VX_OPS_LOCALRESPONSENORMALIZATION_H_ #include "tim/vx/operation.h" +/** + * ## LocalResponseNormalization + * + * Applies Local Response Normalization along the depth dimension: + * + * ``` + * sqr_sum[a, b, c, d] = sum( + * pow(input[a, b, c, d - depth_radius : d + depth_radius + 1], 2)) + * output = input / pow((bias + alpha * sqr_sum), beta) + * ``` + */ + namespace tim { namespace vx { namespace ops { diff --git a/include/tim/vx/ops/logical.h b/include/tim/vx/ops/logical.h index 38e370c..911ac16 100644 --- a/include/tim/vx/ops/logical.h +++ b/include/tim/vx/ops/logical.h @@ -29,6 +29,16 @@ namespace tim { namespace vx { namespace ops { +/** + * ## And + * + * Returns the truth value of x AND y element-wise. This operation supports broadcasting. + * + * ## Or + * + * Returns the truth value of x OR y element-wise. This operation supports broadcasting. 
+ */ + #define DECLARE_LOGICAL_OP(NAME) \ class Logical##NAME : public Operation { \ public: \ diff --git a/include/tim/vx/ops/nbg.h b/include/tim/vx/ops/nbg.h index 63805a6..fffac21 100644 --- a/include/tim/vx/ops/nbg.h +++ b/include/tim/vx/ops/nbg.h @@ -29,6 +29,13 @@ namespace tim { namespace vx { namespace ops { +/** + * ## NBG + * + * Network Binary Graph is a precompile technology, which can compile a fused graph into + * a binary file. + */ + class NBG : public Operation { public: NBG(Graph* graph, const char* binary, size_t input_count, size_t output_count); diff --git a/include/tim/vx/ops/pad.h b/include/tim/vx/ops/pad.h index 1111acc..49448e8 100644 --- a/include/tim/vx/ops/pad.h +++ b/include/tim/vx/ops/pad.h @@ -28,6 +28,15 @@ namespace tim { namespace vx { namespace ops { + +/** + * ## Pad + * + * Pads a tensor. + * + * - const_val : the value to pad. + */ + class Pad : public Operation { public: Pad(Graph* graph, const std::vector& front_size, diff --git a/include/tim/vx/ops/pool2d.h b/include/tim/vx/ops/pool2d.h index a39ecab..879a7d3 100644 --- a/include/tim/vx/ops/pool2d.h +++ b/include/tim/vx/ops/pool2d.h @@ -33,6 +33,18 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Pool2d + * + * Performs a 2-D pooling operation. + * + * - type : MAX, AVG, L2 or AVG_ANDROID. + * - padding : AUTO, VALID or SAME. + * - ksize : filter size. + * - stride : stride along each spatial axis. + * - round_type : CEILING or FLOOR. + */ + class Pool2d : public Operation { public: Pool2d(Graph* graph, PoolType type, PadType padding, diff --git a/include/tim/vx/ops/reduce.h b/include/tim/vx/ops/reduce.h index 5eafd92..f846d16 100644 --- a/include/tim/vx/ops/reduce.h +++ b/include/tim/vx/ops/reduce.h @@ -29,6 +29,71 @@ namespace tim { namespace vx { namespace ops { +/** + * ## ReduceMin + * + * Reduces a tensor by computing the minimum of elements along given dimensions. + * + * - axis : the dimensions to reduce. 
+ * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceMax + * + * Reduces a tensor by computing the maximum of elements along given dimensions. + * + * - axis : the dimensions to reduce. + * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceAny + * + * Reduces a tensor by computing the "logical or" of elements along given dimensions. + * + * - axis : the dimensions to reduce. + * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceAll + * + * Reduces a tensor by computing the "logical and" of elements along given dimensions. + * + * - axis : the dimensions to reduce. + * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceProd + * + * Reduces a tensor by computing the multiplying of elements along given dimensions. + * + * - axis : the dimensions to reduce. + * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceMean + * + * Reduces a tensor by computing the mean of elements along given dimensions. + * + * - axis : the dimensions to reduce. + * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + * + * ## ReduceSum + * + * Reduces a tensor by computing the summing of elements along given dimensions. + * + * - axis : the dimensions to reduce. 
+ * - keep_dims : If keep_dims is true, the reduced dimensions are retained with + * length 1. Otherwise, the rank of the tensor is reduced by 1 for each entry + * in dimensions + */ + #define DECLARE_REDUCE_OP(NAME) \ class Reduce##NAME : public Operation { \ public: \ @@ -43,6 +108,7 @@ namespace ops { DECLARE_REDUCE_OP(Min); DECLARE_REDUCE_OP(Max); DECLARE_REDUCE_OP(Any); +DECLARE_REDUCE_OP(All); DECLARE_REDUCE_OP(Prod); DECLARE_REDUCE_OP(Mean); DECLARE_REDUCE_OP(Sum); diff --git a/include/tim/vx/ops/relational_operations.h b/include/tim/vx/ops/relational_operations.h index b3997da..11c4fcc 100644 --- a/include/tim/vx/ops/relational_operations.h +++ b/include/tim/vx/ops/relational_operations.h @@ -29,6 +29,32 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Greater + * + * For input tensors x and y, computes x > y elementwise. + * + * ## GreaterOrEqual + * + * For input tensors x and y, computes x >= y elementwise. + * + * ## Less + * + * For input tensors x and y, computes x < y elementwise. + * + * ## LessOrEqual + * + * For input tensors x and y, computes x <= y elementwise. + * + * ## NotEqual + * + * For input tensors x and y, computes x != y elementwise. + * + * ## Equal + * + * For input tensors x and y, computes x == y elementwise. + */ + #define DECLARE_RELATIONAL_OP(NAME) \ class NAME : public Operation { \ public: \ diff --git a/include/tim/vx/ops/reorg.h b/include/tim/vx/ops/reorg.h index 18ccc89..22d115e 100644 --- a/include/tim/vx/ops/reorg.h +++ b/include/tim/vx/ops/reorg.h @@ -29,6 +29,12 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Reorg + * + * The layer used in YOLOv2. 
See also https://github.com/pjreddie/darknet/blob/master/src/reorg_layer.c + */ + class Reorg : public Operation { public: Reorg(Graph* graph, const uint32_t stride); diff --git a/include/tim/vx/ops/reshape.h b/include/tim/vx/ops/reshape.h index 11eab62..eae55e0 100644 --- a/include/tim/vx/ops/reshape.h +++ b/include/tim/vx/ops/reshape.h @@ -29,6 +29,14 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Reshape + * + * Given tensor, this operation returns a tensor that has the same values as tensor, but with a newly specified shape. + * + * - size : defining the shape of the output tensor. + */ + class Reshape : public Operation { public: Reshape(Graph* graph, const std::vector& perm); diff --git a/include/tim/vx/ops/resize.h b/include/tim/vx/ops/resize.h index 4a3ca80..db0bf1b 100644 --- a/include/tim/vx/ops/resize.h +++ b/include/tim/vx/ops/resize.h @@ -29,6 +29,21 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Resize + * + * Resizes images to given size. + * + * - type : NEAREST_NEIGHBOR, BILINEAR or AREA. + * - factor : scale the input size. DO NOT use it with target_height / target_width together. + * - align_corners : If True, the centers of the 4 corner pixels of the input and output + * tensors are aligned, preserving the values at the corner pixels. + * - half_pixel_centers : If True, the pixel centers are assumed to be at (0.5, 0.5). + * This is the default behavior of image.resize in TF 2.0. If this parameter is True, + * then align_corners parameter must be False. + * - target_height / target_width : output height / width. DO NOT use it with factor together. 
+ */ + class Resize : public Operation { public: Resize(Graph* graph, ResizeType type, float factor, bool align_corners, diff --git a/include/tim/vx/ops/reverse.h b/include/tim/vx/ops/reverse.h index 7cf6c77..abfc82e 100644 --- a/include/tim/vx/ops/reverse.h +++ b/include/tim/vx/ops/reverse.h @@ -29,6 +29,14 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Reverse + * + * Reverses specific dimensions of a tensor. + * + * - axis : The indices of the dimensions to reverse. + */ + class Reverse : public Operation { public: Reverse(Graph* graph, int32_t* axis, uint32_t axis_num); diff --git a/include/tim/vx/ops/select.h b/include/tim/vx/ops/select.h index aa4c9a4..8a7ab34 100644 --- a/include/tim/vx/ops/select.h +++ b/include/tim/vx/ops/select.h @@ -29,6 +29,13 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Select + * + * Using a tensor of booleans c and input tensors x and y select values elementwise + * from both input tensors: O[i] = C[i] ? x[i] : y[i]. + */ + class Select : public Operation { public: Select(Graph* graph); diff --git a/include/tim/vx/ops/simple_operations.h b/include/tim/vx/ops/simple_operations.h index f817a14..3d9d347 100644 --- a/include/tim/vx/ops/simple_operations.h +++ b/include/tim/vx/ops/simple_operations.h @@ -35,6 +35,57 @@ namespace ops { NAME(Graph* graph); \ }; +/** + * ## DataConvert + * + * Change the format from input tensor to output tensor. + * + * ## Neg + * + * Neg(x) : -x + * + * ## Abs + * + * Abs(x) : x if x >= 0; -x if x < 0. + * + * ## Sin + * + * Sin(x) : sin(x) + * + * ## Exp + * + * Exp(x) : e^x + * + * ## Log + * + * Log(x) : ln(x) + * + * ## Sqrt + * + * Sqrt(x) : $$\sqrt{x}$$ + * + * ## Rsqrt + * + * Rsqrt(x) : $$\frac{1}{\sqrt{x}}$$ + * + * ## Square + * + * Square : x^2 + * + * ## LogicalNot + * + * LogicalNot(x) : NOT x + * + * ## Floor + * + * returns the largest integer less than or equal to a given number. + * + * ## Cast + * + * Change the format from input tensor to output tensor. 
This operation ignores + * the scale and zeroPoint of quantized tensors. + */ + DECLARE_SIMPLE_OP(DataConvert) DECLARE_SIMPLE_OP(Neg) DECLARE_SIMPLE_OP(Abs) diff --git a/include/tim/vx/ops/slice.h b/include/tim/vx/ops/slice.h index 11d7834..4bacce0 100644 --- a/include/tim/vx/ops/slice.h +++ b/include/tim/vx/ops/slice.h @@ -29,6 +29,15 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Slice + * + * Extracts a slice of specified size from the input tensor starting at a specified location. + * + * - start : the beginning indices of the slice in each dimension. + * - length : the size of the slice in each dimension. + */ + class Slice : public Operation { public: Slice(Graph* graph, diff --git a/include/tim/vx/ops/softmax.h b/include/tim/vx/ops/softmax.h index 049b1f2..54f9425 100644 --- a/include/tim/vx/ops/softmax.h +++ b/include/tim/vx/ops/softmax.h @@ -29,6 +29,19 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Softmax + * + * Computes the softmax activation on the input tensor element-wise, per batch, + * by normalizing the input vector so the maximum coefficient is zero: + * + * ``` + * output[batch, i] = + * exp((input[batch, i] - max(input[batch, :])) * beta) / + * sum_{k}{exp((input[batch, k] - max(input[batch, :])) * beta)} + * ``` + */ + class Softmax : public Operation { public: Softmax(Graph* graph, float beta, int32_t axis); diff --git a/include/tim/vx/ops/space2batch.h b/include/tim/vx/ops/space2batch.h index 9237c4a..298d182 100644 --- a/include/tim/vx/ops/space2batch.h +++ b/include/tim/vx/ops/space2batch.h @@ -32,6 +32,20 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Space2Batch + * + * This operation divides "spatial" dimensions [1, ..., M] of the input into a grid + * of blocks of shape **block_size**, and interleaves these blocks with the "batch" + * dimension (0) such that in the output, the spatial dimensions [1, ..., M] correspond + * to the position within the grid, and the batch dimension combines both 
the position + * within a spatial block and the original batch position. Prior to division into blocks, + * the spatial dimensions of the input are optionally zero padded according to paddings. + * This is the reverse transformation of Batch2Space. + * + * - pad : the paddings for each spatial dimension of the input tensor. + */ + class Space2Batch : public Operation { public: Space2Batch(Graph* graph, const std::vector& block_size, diff --git a/include/tim/vx/ops/space2depth.h b/include/tim/vx/ops/space2depth.h index 5acd6b4..2ec07bc 100644 --- a/include/tim/vx/ops/space2depth.h +++ b/include/tim/vx/ops/space2depth.h @@ -29,6 +29,15 @@ namespace tim { namespace vx { namespace ops { +/** + * ## SpaceToDepth + * + * SpaceToDepth rearranges blocks of spatial data into depth. More specifically, + * this op outputs a copy of the input tensor where values from the height and + * width dimensions are moved to the depth dimension. This is the reverse + * transformation of DepthToSpace. + */ + class SpaceToDepth : public Operation { public: SpaceToDepth(Graph* graph, std::vector block_size, diff --git a/include/tim/vx/ops/split.h b/include/tim/vx/ops/split.h index 5d70a65..ee5db22 100644 --- a/include/tim/vx/ops/split.h +++ b/include/tim/vx/ops/split.h @@ -31,6 +31,15 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Split + * + * Splits a tensor along a given axis into num_splits subtensors. + * + * - axis : the axis along which to split. + * - slices : indicating the number of splits along given axis. + */ + class Split : public Operation { public: Split(Graph* graph, uint32_t axis, std::vector slices); diff --git a/include/tim/vx/ops/squeeze.h b/include/tim/vx/ops/squeeze.h index 2166353..1e78832 100644 --- a/include/tim/vx/ops/squeeze.h +++ b/include/tim/vx/ops/squeeze.h @@ -30,6 +30,14 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Squeeze + * + * Removes dimensions of size 1 from the shape of a tensor. 
+ * + * - axis : the dimensions to squeeze. + */ + class Squeeze : public Operation { public: Squeeze(Graph* graph, std::vector axis); diff --git a/include/tim/vx/ops/stack.h b/include/tim/vx/ops/stack.h index 2c2d92d..f5bd76d 100644 --- a/include/tim/vx/ops/stack.h +++ b/include/tim/vx/ops/stack.h @@ -29,6 +29,13 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Stack + * + * Packs the list of tensors in inputs into a tensor with rank one higher than + * each tensor in values, by packing them along the **axis** dimension. + */ + class Stack : public Operation { public: Stack(Graph* graph, uint32_t axis, int input_cnt); diff --git a/include/tim/vx/ops/stridedslice.h b/include/tim/vx/ops/stridedslice.h index 10c78d6..fc2e8e9 100644 --- a/include/tim/vx/ops/stridedslice.h +++ b/include/tim/vx/ops/stridedslice.h @@ -29,6 +29,29 @@ namespace tim { namespace vx { namespace ops { +/** + * ## StridedSlice + * + * Extracts a strided slice of a tensor. + * + * Roughly speaking, this op extracts a slice of size (end - begin) / stride from + * the given input tensor. Starting at the location specified by begin the slice + * continues by adding stride to the index until all dimensions are not less than end. + * Note that a stride can be negative, which causes a reverse slice. + * + * - begin_dims : the starts of the dimensions of the input tensor to be sliced. + * - end_dims : the ends of the dimensions of the input tensor to be sliced. + * - stride_dims : the strides of the dimensions of the input tensor to be sliced. + * - begin_mask : if the ith bit of begin_mask is set, begin[i] is ignored and + * the fullest possible range in that dimension is used instead. + * - end_mask : if the ith bit of end_mask is set, end[i] is ignored and the fullest + * possible range in that dimension is used instead. 
+ * - shrink_axis_mask : if the ith bit of shrink_axis_mask is set, the ith dimension + * specification shrinks the dimensionality by 1, taking on the value at index begin[i]. + * In this case, the ith specification must define a slice of size 1, + * e.g. begin[i] = x, end[i] = x + 1. + */ + class StridedSlice : public Operation { public: StridedSlice(Graph* graph, const std::vector begin_dims, diff --git a/include/tim/vx/ops/transpose.h b/include/tim/vx/ops/transpose.h index ee6f18f..ead3f7b 100644 --- a/include/tim/vx/ops/transpose.h +++ b/include/tim/vx/ops/transpose.h @@ -29,6 +29,18 @@ namespace tim { namespace vx { namespace ops { +/** + * ## Transpose + * + * Transposes the input tensor, permuting the dimensions according to the + * **perm** tensor. + * + * The returned tensor's dimension i corresponds to the input dimension perm[i]. + * If perm is not given, it is set to (n-1...0), where n is the rank of the input + * tensor. Hence by default, this operation performs a regular matrix transpose on + * 2-D input Tensors. + */ + class Transpose : public Operation { public: Transpose(Graph* graph, const std::vector& perm); diff --git a/src/tim/vx/ops/reduce.cc b/src/tim/vx/ops/reduce.cc index f424e3e..11cdf9e 100644 --- a/src/tim/vx/ops/reduce.cc +++ b/src/tim/vx/ops/reduce.cc @@ -45,6 +45,7 @@ namespace ops { DEFINE_REDUCE_OP(Min, VSI_NN_REDUCE_MIN); DEFINE_REDUCE_OP(Max, VSI_NN_REDUCE_MAX); DEFINE_REDUCE_OP(Any, VSI_NN_REDUCE_ANY); +DEFINE_REDUCE_OP(All, VSI_NN_REDUCE_ALL); DEFINE_REDUCE_OP(Prod, VSI_NN_REDUCE_PROD); DEFINE_REDUCE_OP(Mean, VSI_NN_REDUCE_MEAN); DEFINE_REDUCE_OP(Sum, VSI_NN_REDUCE_SUM);