Source code for auto_diff.op.op_dot

from typing import Mapping, Union
import numpy as np
from .operation import Operation
from .op_placeholder import OpPlaceholder


class OpDot(Operation):
    """Dot product of two tensors.

    **If** either `x` or `y` is a scalar, then it is equivalent to :class:`OpMultiply`.

    **If** both `x` and `y` are 1-D arrays, it is the inner product of the vectors and the result is a scalar:

    .. math::
       z = \\sum_k x_{k} \\cdot y_{k}

    Partial derivatives of a single element:

    .. math::
       \\frac{\\partial L}{\\partial x_{i}}
       = \\frac{\\partial L}{\\partial z} \\cdot \\frac{\\partial z}{\\partial x_i}
       = \\frac{\\partial L}{\\partial z} \\cdot y_i

    .. math::
       \\frac{\\partial L}{\\partial y_{i}}
       = \\frac{\\partial L}{\\partial z} \\cdot \\frac{\\partial z}{\\partial y_i}
       = \\frac{\\partial L}{\\partial z} \\cdot x_i

    Vector derivatives:

    .. math::
       \\frac{\\partial L}{\\partial x} = \\frac{\\partial L}{\\partial z} \\cdot y

    .. math::
       \\frac{\\partial L}{\\partial y} = \\frac{\\partial L}{\\partial z} \\cdot x

    Note that since `z` is a scalar, computing the vector derivatives in this case reduces to the
    scalar case of the dot operation.

    **If** both `x` and `y` are 2-D arrays, it is the matrix multiplication and the result is a 2-D array:

    .. math::
       z_{ij} = \\sum_{k} x_{ik} \\cdot y_{kj}

    Partial derivative of a single element:

    .. math::
       \\begin{array}{rcl}
       \\displaystyle \\frac{\\partial L}{\\partial x_{ij}}
       &=& \\displaystyle \\sum_{a,b} \\frac{\\partial L}{\\partial z_{ab}} \\cdot
           \\frac{\\partial z_{ab}}{\\partial x_{ij}} \\\\
       &=& \\displaystyle \\sum_{a,b} \\frac{\\partial L}{\\partial z_{ab}} \\cdot
           \\frac{\\partial \\left ( \\sum_k x_{ak} \\cdot y_{kb} \\right )}{\\partial x_{ij}} \\\\
       &=& \\displaystyle \\sum_{b} \\frac{\\partial L}{\\partial z_{ib}} \\cdot
           \\frac{\\partial \\left ( x_{ij} \\cdot y_{jb} \\right )}{\\partial x_{ij}} \\\\
       &=& \\displaystyle \\sum_{k} \\frac{\\partial L}{\\partial z_{ik}} \\cdot y_{jk} \\\\
       &=& \\displaystyle \\sum_{k} \\left ( \\frac{\\partial L}{\\partial Z} \\right )_{ik} \\cdot (Y^T)_{kj} \\\\
       \\end{array}

    .. math::
       \\frac{\\partial L}{\\partial y_{ij}} = \\sum_{k} (X^T)_{ik} \\cdot
       \\left ( \\frac{\\partial L}{\\partial Z} \\right )_{kj}

    The partial derivatives have the same form as the dot operation itself, therefore the matrix
    derivatives are:

    .. math::
       \\frac{\\partial L}{\\partial X} = \\frac{\\partial L}{\\partial Z} \\cdot Y^T

    .. math::
       \\frac{\\partial L}{\\partial Y} = X^T \\cdot \\frac{\\partial L}{\\partial Z}

    **If** `x` is an N-D tensor and `y` is an M-D tensor (M >= 2), it is a sum product over the
    last axis of `x` and the second-to-last axis of `y`.

    **If** `x` is an N-D tensor and `y` is a 1-D array, it is a sum product over the last axis of
    `x` and `y`. This is a special case of the previous condition if `y` is considered as a
    K x 1 matrix and the result is squeezed.
    """
    def __init__(self, x: Operation, y: Operation, **kwargs):
        self.inputs = [x, y]
        if x.isscalar():
            self.shape = y.shape
        elif y.isscalar():
            self.shape = x.shape
        elif x.dim == 1 and y.dim == 1:
            if x.shape[0] != y.shape[0]:
                raise ValueError('The dimensions of inputs should be equal, found %s and %s'
                                 % (str(x.shape), str(y.shape)))
            self.shape = ()
        elif x.dim == 2 and y.dim == 2:
            if x.shape[1] != y.shape[0]:
                raise ValueError('The last dimension of the first input and the first dimension of the second input '
                                 'should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = (x.shape[0], y.shape[1])
        elif y.dim == 1:
            if x.shape[-1] != y.shape[0]:
                raise ValueError('The last dimension of the first input and dimension of the second input '
                                 'should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = x.shape[:-1]
        else:
            if x.shape[-1] != y.shape[-2]:
                raise ValueError('The last dimension of the first input and second-to-last dimension of the second '
                                 'input should be equal, found %s and %s' % (str(x.shape), str(y.shape)))
            self.shape = x.shape[:-1] + y.shape[:-2] + (y.shape[-1],)
        super(OpDot, self).__init__(**kwargs)
    def _get_name(self) -> str:
        return 'dot(%s, %s)' % (self.inputs[0].name, self.inputs[1].name)

    def _get_op_name(self) -> str:
        return 'dot(%s, %s)' % (self.inputs[0]._op_name, self.inputs[1]._op_name)

    def _forward(self, feed_dict: Mapping[Union[str, OpPlaceholder], np.ndarray]) -> np.ndarray:
        return np.dot(self.inputs[0].forward(feed_dict), self.inputs[1].forward(feed_dict))

    def _backward(self, gradient: Operation) -> None:
        x, y = self.inputs
        if x.isscalar():
            self.gradient = [
                (gradient * y).sum(),
                gradient * x,
            ]
        elif y.isscalar():
            self.gradient = [
                gradient * y,
                (gradient * x).sum(),
            ]
        elif x.dim == 1 and y.dim == 1:
            self.gradient = [
                gradient * y,
                gradient * x,
            ]
        elif x.dim == 2 and y.dim == 2:
            self.gradient = [
                gradient.dot(y.transpose()),
                x.transpose().dot(gradient),
            ]
        elif y.dim == 1:
            self.gradient = [
                gradient.expand_dims(axis=-1).dot(y.expand_dims(axis=0)),
                (x * gradient.expand_dims(axis=-1)).sum(axis=tuple(range(x.dim - 1))),
            ]
        else:
            zx_dim, zy_dim = x.dim - 1, y.dim - 2
            self.gradient = [
                gradient.sum(axis=tuple(range(zx_dim, zx_dim + zy_dim))).dot(
                    y.sum(axis=tuple(range(zy_dim))).transpose()
                ) * (1.0 / np.prod(y.shape[:zy_dim])),
                gradient.sum(axis=tuple(range(zx_dim))).expand_dims(axis=-1).dot(
                    x.sum(axis=tuple(range(zx_dim))).expand_dims(axis=0)
                ).transpose(axes=tuple(range(zy_dim)) + (-1, -2)) * (1.0 / np.prod(x.shape[:zx_dim])),
            ]
        x.backward(self.gradient[0])
        y.backward(self.gradient[1])
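
The 2-D case in the docstring above concludes that dL/dX = dL/dZ · Y^T and dL/dY = X^T · dL/dZ, which is exactly what the 2-D branch of `_backward` computes. A minimal NumPy sketch, separate from the module, that checks the first identity against central finite differences; the array sizes and the choice L = (X · Y).sum() are illustrative assumptions only.

import numpy as np

# Illustrative sketch, not part of auto_diff:
# verify dL/dX = dL/dZ . Y^T for L = (X . Y).sum() against finite differences.
rng = np.random.default_rng(0)
x = rng.standard_normal((3, 4))
y = rng.standard_normal((4, 5))

grad_z = np.ones((3, 5))            # dL/dZ when L = Z.sum()
grad_x = grad_z.dot(y.T)            # dL/dX = dL/dZ . Y^T
grad_y = x.T.dot(grad_z)            # dL/dY = X^T . dL/dZ

eps = 1e-6
num_grad_x = np.zeros_like(x)
for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        xp, xm = x.copy(), x.copy()
        xp[i, j] += eps
        xm[i, j] -= eps
        num_grad_x[i, j] = (xp.dot(y).sum() - xm.dot(y).sum()) / (2 * eps)

assert np.allclose(grad_x, num_grad_x, atol=1e-5)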
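
The shape inference in `__init__` mirrors the rules of `np.dot` branch by branch. A small standalone sketch (plain NumPy arrays stand in for `Operation` inputs; the shapes are arbitrary examples) showing the output shape produced for each case:

import numpy as np

# Illustrative sketch only: output shapes of np.dot for each branch handled in __init__.
cases = [
    (np.float64(2.0), np.ones((2, 3))),         # scalar input        -> (2, 3)
    (np.ones(4), np.ones(4)),                   # 1-D . 1-D           -> ()
    (np.ones((2, 4)), np.ones((4, 3))),         # 2-D . 2-D           -> (2, 3)
    (np.ones((2, 3, 4)), np.ones(4)),           # N-D . 1-D           -> (2, 3)
    (np.ones((2, 3, 4)), np.ones((5, 4, 6))),   # N-D . M-D (M >= 2)  -> (2, 3, 5, 6)
]
for x, y in cases:
    print(np.shape(x), np.shape(y), '->', np.shape(np.dot(x, y)))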
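
`_backward` builds the upstream gradients as graph operations. For the `y.dim == 1` branch, the same quantities can be written with plain NumPy broadcasting; the sketch below assumes L = np.dot(x, y).sum(), so that dL/dZ is all ones, and checks dL/dy against finite differences.

import numpy as np

# Illustrative sketch only, assuming L = np.dot(x, y).sum() so that dL/dZ is all ones.
rng = np.random.default_rng(1)
x = rng.standard_normal((2, 3, 4))   # N-D input
y = rng.standard_normal(4)           # 1-D input
grad_z = np.ones((2, 3))             # dL/dZ

# Equivalent to the y.dim == 1 branch of _backward, written with NumPy broadcasting.
grad_x = np.expand_dims(grad_z, -1) * y                       # dL/dx[a, b, j] = dL/dz[a, b] * y[j]
grad_y = (x * np.expand_dims(grad_z, -1)).sum(axis=(0, 1))    # sum over all leading axes

eps = 1e-6
num_grad_y = np.array([
    (np.dot(x, y + eps * np.eye(4)[k]).sum() - np.dot(x, y - eps * np.eye(4)[k]).sum()) / (2 * eps)
    for k in range(4)
])
assert np.allclose(grad_y, num_grad_y, atol=1e-5)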