Source code for jax_utils.markov_decision_process

"""High-level abstractions for decision problems (Markov Decision processes, etc...)"""

from typing import Protocol, Tuple, TypeVar

State = TypeVar("State")
State_contra = TypeVar("State_contra", contravariant=True)
Action = TypeVar("Action")
Action_contra = TypeVar("Action_contra", contravariant=True)
Observation = TypeVar("Observation")
Observation_co = TypeVar("Observation_co", covariant=True)
Observation_contra = TypeVar("Observation_contra", contravariant=True)
Cost = TypeVar("Cost")
Cost_co = TypeVar("Cost_co", covariant=True)
Cost_contra = TypeVar("Cost_contra", contravariant=True)
RegularizedCost = TypeVar("RegularizedCost")


class Dynamics(Protocol[State, Action_contra, Cost_co, Observation_co]):
    """Interface defining the dynamics of a `(Partially Observable)
    <https://en.wikipedia.org/wiki/Partially_observable_Markov_decision_process>`_
    `Markov Decision Process <https://en.wikipedia.org/wiki/Markov_decision_process>`_.

    When an "agent" interacting with the (PO)MDP plays an "action" (a.k.a. "control")
    in a given "state", the (PO)MDP transitions to a new state and the agent observes
    some signal/feedback in the form of a "cost"/"reward" as well as additional
    "observations". A `Dynamics` is therefore a callable that maps a state-action pair
    to a state-cost-observation tuple.
    """

    def __call__(
        self, state: State, action: Action_contra
    ) -> Tuple[State, Cost_co, Observation_co]:
        pass
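
# --- Example (illustrative sketch, not part of the library) -----------------
# A minimal `Dynamics` implementation, assuming a fully observed 1-D system:
# the state is a float, the action shifts it, the cost is the squared distance
# to the origin, and the observation simply echoes the new state. The class
# name and the specific cost are illustrative assumptions.
class _PointMassDynamics:
    """Illustrative structural match for `Dynamics[float, float, float, float]`."""

    def __call__(self, state: float, action: float) -> Tuple[float, float, float]:
        next_state = state + action  # deterministic transition
        cost = next_state**2  # quadratic cost around the origin
        observation = next_state  # fully observed system
        return next_state, cost, observation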
class CostRegularizer(
    Protocol[State_contra, Action_contra, Cost_contra, Observation_contra, Cost_co]
):
    """Interface for callables that map any state-action-cost-observation tuple to a
    new "regularized" cost.

    `More about regularization <https://en.wikipedia.org/wiki/Regularization_(mathematics)>`_.

    Example: one may want to penalize actions with a high norm.
    """

    def __call__(
        self,
        state: State_contra,
        action: Action_contra,
        cost: Cost_contra,
        observation: Observation_contra,
    ) -> Cost_co:
        pass
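
# --- Example (illustrative sketch, not part of the library) -----------------
# A minimal `CostRegularizer` along the lines suggested in the docstring above:
# it adds a scaled squared-magnitude penalty on the action to the raw cost.
# The class name and the `weight` attribute are illustrative assumptions.
class _ActionNormRegularizer:
    """Illustrative match for `CostRegularizer[float, float, float, float, float]`."""

    def __init__(self, weight: float = 0.1) -> None:
        self.weight = weight  # strength of the action penalty

    def __call__(
        self, state: float, action: float, cost: float, observation: float
    ) -> float:
        return cost + self.weight * action**2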
class RegularizedDynamics(
    Dynamics[State, Action, RegularizedCost, Observation],
    Protocol[State, Action, Cost, RegularizedCost, Observation],
):
    """Interface defining a wrapper around :class:`jax_utils.markov_decision_process.Dynamics`
    that adds a regularization term to the cost. A ``RegularizedDynamics`` is itself a
    :class:`jax_utils.markov_decision_process.Dynamics`.

    `More about regularization <https://en.wikipedia.org/wiki/Regularization_(mathematics)>`_.

    Args:
        dynamics (Dynamics[State, Action, Cost, Observation]): callable defining the
            dynamics of a (PO)MDP
        cost_regularizer (CostRegularizer[State, Action, Cost, Observation, RegularizedCost]):
            callable defining a cost transformation (regularization)
    """

    dynamics: Dynamics[State, Action, Cost, Observation]
    cost_regularizer: CostRegularizer[State, Action, Cost, Observation, RegularizedCost]

    def __call__(
        self, state: State, action: Action
    ) -> Tuple[State, RegularizedCost, Observation]:
        state, cost, observation = self.dynamics(state, action)
        regularized_cost = self.cost_regularizer(state, action, cost, observation)
        return state, regularized_cost, observation
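
# --- Example (illustrative sketch, not part of the library) -----------------
# How the pieces might compose: an explicit dataclass subclass of
# `RegularizedDynamics` holding the example dynamics and regularizer defined
# above, followed by a single transition. The concrete class, field values,
# and the `__main__` guard are illustrative assumptions, not a public API.
from dataclasses import dataclass


@dataclass
class _SimpleRegularizedDynamics(
    RegularizedDynamics[float, float, float, float, float]
):
    dynamics: Dynamics[float, float, float, float]
    cost_regularizer: CostRegularizer[float, float, float, float, float]


if __name__ == "__main__":
    regularized = _SimpleRegularizedDynamics(
        dynamics=_PointMassDynamics(),
        cost_regularizer=_ActionNormRegularizer(weight=0.1),
    )
    state, cost, observation = regularized(state=1.0, action=-0.5)
    # state == 0.5, cost == 0.25 + 0.1 * 0.25 == 0.275, observation == 0.5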