zjowowen committed on
Commit 079c32c
1 Parent(s): a73e77c

init space

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. DI-engine +0 -1
  2. DI-engine/.flake8 +4 -0
  3. DI-engine/.gitignore +1431 -0
  4. DI-engine/.style.yapf +11 -0
  5. DI-engine/CHANGELOG +489 -0
  6. DI-engine/CODE_OF_CONDUCT.md +128 -0
  7. DI-engine/CONTRIBUTING.md +7 -0
  8. DI-engine/LICENSE +202 -0
  9. DI-engine/Makefile +71 -0
  10. DI-engine/README.md +475 -0
  11. DI-engine/cloc.sh +69 -0
  12. DI-engine/codecov.yml +8 -0
  13. DI-engine/conda/conda_build_config.yaml +2 -0
  14. DI-engine/conda/meta.yaml +35 -0
  15. DI-engine/ding/__init__.py +12 -0
  16. DI-engine/ding/bonus/__init__.py +132 -0
  17. DI-engine/ding/bonus/a2c.py +460 -0
  18. DI-engine/ding/bonus/c51.py +459 -0
  19. DI-engine/ding/bonus/common.py +22 -0
  20. DI-engine/ding/bonus/config.py +326 -0
  21. DI-engine/ding/bonus/ddpg.py +456 -0
  22. DI-engine/ding/bonus/dqn.py +460 -0
  23. DI-engine/ding/bonus/model.py +245 -0
  24. DI-engine/ding/bonus/pg.py +453 -0
  25. DI-engine/ding/bonus/ppo_offpolicy.py +471 -0
  26. DI-engine/ding/bonus/ppof.py +509 -0
  27. DI-engine/ding/bonus/sac.py +457 -0
  28. DI-engine/ding/bonus/sql.py +461 -0
  29. DI-engine/ding/bonus/td3.py +455 -0
  30. DI-engine/ding/compatibility.py +9 -0
  31. DI-engine/ding/config/__init__.py +4 -0
  32. DI-engine/ding/config/config.py +579 -0
  33. DI-engine/ding/config/example/A2C/__init__.py +17 -0
  34. DI-engine/ding/config/example/A2C/gym_bipedalwalker_v3.py +43 -0
  35. DI-engine/ding/config/example/A2C/gym_lunarlander_v2.py +38 -0
  36. DI-engine/ding/config/example/C51/__init__.py +23 -0
  37. DI-engine/ding/config/example/C51/gym_lunarlander_v2.py +52 -0
  38. DI-engine/ding/config/example/C51/gym_pongnoframeskip_v4.py +54 -0
  39. DI-engine/ding/config/example/C51/gym_qbertnoframeskip_v4.py +54 -0
  40. DI-engine/ding/config/example/C51/gym_spaceInvadersnoframeskip_v4.py +54 -0
  41. DI-engine/ding/config/example/DDPG/__init__.py +29 -0
  42. DI-engine/ding/config/example/DDPG/gym_bipedalwalker_v3.py +45 -0
  43. DI-engine/ding/config/example/DDPG/gym_halfcheetah_v3.py +53 -0
  44. DI-engine/ding/config/example/DDPG/gym_hopper_v3.py +53 -0
  45. DI-engine/ding/config/example/DDPG/gym_lunarlandercontinuous_v2.py +60 -0
  46. DI-engine/ding/config/example/DDPG/gym_pendulum_v1.py +52 -0
  47. DI-engine/ding/config/example/DDPG/gym_walker2d_v3.py +53 -0
  48. DI-engine/ding/config/example/DQN/__init__.py +23 -0
  49. DI-engine/ding/config/example/DQN/gym_lunarlander_v2.py +53 -0
  50. DI-engine/ding/config/example/DQN/gym_pongnoframeskip_v4.py +50 -0
DI-engine DELETED
@@ -1 +0,0 @@
- Subproject commit a57bc3024b938c881aaf6511d1fb26296cd98601
 
 
DI-engine/.flake8 ADDED
@@ -0,0 +1,4 @@
+ [flake8]
+ ignore=F401,F841,F403,E226,E126,W504,E265,E722,W503,W605,E741,E122,E731
+ max-line-length=120
+ statistics
DI-engine/.gitignore ADDED
@@ -0,0 +1,1431 @@
+ # Created by .ignore support plugin (hsz.mobi)
+ ### ArchLinuxPackages template
+ *.tar
+ *.tar.*
+ *.jar
+ *.exe
+ *.msi
+ *.zip
+ *.tgz
+ *.log
+ *.log.*
+ *.sig
+ *.mov
+ *.pkl
+
+ pkg/
+ src/
+ impala_log/
+
+ ### CVS template
+ /CVS/*
+ **/CVS/*
+ .cvsignore
+ */.cvsignore
+
+ ### LibreOffice template
+ # LibreOffice locks
+ .~lock.*#
+
+ ### CUDA template
+ *.i
+ *.ii
+ *.gpu
+ *.ptx
+ *.cubin
+ *.fatbin
+
+ ### Eclipse template
+ *.bin
+ .metadata
+ bin/
+ tmp/
+ *.tmp
+ *.bak
+ *.swp
+ *~.nib
+ local.properties
+ .settings/
+ .loadpath
+ .recommenders
+
+ # External tool builders
+ .externalToolBuilders/
+
+ # Locally stored "Eclipse launch configurations"
+ *.launch
+
+ # PyDev specific (Python IDE for Eclipse)
+ *.pydevproject
+
+ # CDT-specific (C/C++ Development Tooling)
+ .cproject
+
+ # CDT- autotools
+ .autotools
+
+ # Java annotation processor (APT)
+ .factorypath
+
+ # PDT-specific (PHP Development Tools)
+ .buildpath
+
+ # sbteclipse plugin
+ .target
+
+ # Tern plugin
+ .tern-project
+
+ # TeXlipse plugin
+ .texlipse
+
+ # STS (Spring Tool Suite)
+ .springBeans
+
+ # Code Recommenders
+ .recommenders/
+
+ # Annotation Processing
+ .apt_generated/
+ .apt_generated_test/
+
+ # Scala IDE specific (Scala & Java development for Eclipse)
+ .cache-main
+ .scala_dependencies
+ .worksheet
+
+ # Uncomment this line if you wish to ignore the project description file.
+ # Typically, this file would be tracked if it contains build/dependency configurations:
+ #.project
+
+ ### SVN template
+ .svn/
+
+ ### Images template
+ # JPEG
+ *.jpg
+ *.jpeg
+ *.jpe
+ *.jif
+ *.jfif
+ *.jfi
+
+ # JPEG 2000
+ *.jp2
+ *.j2k
+ *.jpf
+ *.jpx
+ *.jpm
+ *.mj2
+
+ # JPEG XR
+ *.jxr
+ *.hdp
+ *.wdp
+
+ # Graphics Interchange Format
+ *.gif
+ *.mp4
+ *.mpg
+
+ # RAW
+ *.raw
+
+ # Web P
+ *.webp
+
+ # Portable Network Graphics
+ *.png
+
+ # Animated Portable Network Graphics
+ *.apng
+
+ # Multiple-image Network Graphics
+ *.mng
+
+ # Tagged Image File Format
+ *.tiff
+ *.tif
+
+ # Scalable Vector Graphics
+ *.svg
+ *.svgz
+
+ # Portable Document Format
+ *.pdf
+
+ # X BitMap
+ *.xbm
+
+ # BMP
+ *.bmp
+ *.dib
+
+ # ICO
+ *.ico
+
+ # 3D Images
+ *.3dm
+ *.max
+
+ ### Diff template
+ *.patch
+ *.diff
+
+ ### JetBrains template
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+ # User-specific stuff
+ .idea/**/workspace.xml
+ .idea/**/tasks.xml
+ .idea/**/usage.statistics.xml
+ .idea/**/dictionaries
+ .idea/**/shelf
+
+ # Generated files
+ .idea/**/contentModel.xml
+
+ # Sensitive or high-churn files
+ .idea/**/dataSources/
+ .idea/**/dataSources.ids
+ .idea/**/dataSources.local.xml
+ .idea/**/sqlDataSources.xml
+ .idea/**/dynamic.xml
+ .idea/**/uiDesigner.xml
+ .idea/**/dbnavigator.xml
+
+ # Gradle
+ .idea/**/gradle.xml
+ .idea/**/libraries
+
+ # Gradle and Maven with auto-import
+ # When using Gradle or Maven with auto-import, you should exclude module files,
+ # since they will be recreated, and may cause churn. Uncomment if using
+ # auto-import.
+ # .idea/artifacts
+ # .idea/compiler.xml
+ # .idea/jarRepositories.xml
+ # .idea/modules.xml
+ # .idea/*.iml
+ # .idea/modules
+ # *.iml
+ # *.ipr
+
+ # CMake
+ cmake-build-*/
+
+ # Mongo Explorer plugin
+ .idea/**/mongoSettings.xml
+
+ # File-based project format
+ *.iws
+
+ # IntelliJ
+ out/
+
+ # mpeltonen/sbt-idea plugin
+ .idea_modules/
+
+ # JIRA plugin
+ atlassian-ide-plugin.xml
+
+ # Cursive Clojure plugin
+ .idea/replstate.xml
+
+ # Crashlytics plugin (for Android Studio and IntelliJ)
+ com_crashlytics_export_strings.xml
+ crashlytics.properties
+ crashlytics-build.properties
+ fabric.properties
+
+ # Editor-based Rest Client
+ .idea/httpRequests
+
+ # Android studio 3.1+ serialized cache file
+ .idea/caches/build_file_checksums.ser
+
+ ### CodeIgniter template
+ */config/development
+ */logs/log-*.php
+ !*/logs/index.html
+ */cache/*
+ !*/cache/index.html
+ !*/cache/.htaccess
+
+ user_guide_src/build/*
+ user_guide_src/cilexer/build/*
+ user_guide_src/cilexer/dist/*
+ user_guide_src/cilexer/pycilexer.egg-info/*
+
+ #codeigniter 3
+ application/logs/*
+ !application/logs/index.html
+ !application/logs/.htaccess
+ /vendor/
+
+ ### Emacs template
+ # -*- mode: gitignore; -*-
+ *~
+ \#*\#
+ /.emacs.desktop
+ /.emacs.desktop.lock
+ *.elc
+ auto-save-list
+ tramp
+ .\#*
+
+ # Org-mode
+ .org-id-locations
+ *_archive
+
+ # flymake-mode
+ *_flymake.*
+
+ # eshell files
+ /eshell/history
+ /eshell/lastdir
+
+ # elpa packages
+ /elpa/
+
+ # reftex files
+ *.rel
+
+ # AUCTeX auto folder
+ /auto/
+
+ # cask packages
+ .cask/
+ dist/
+
+ # Flycheck
+ flycheck_*.el
+
+ # server auth directory
+ /server/
+
+ # projectiles files
+ .projectile
+
+ # directory configuration
+ .dir-locals.el
+
+ # network security
+ /network-security.data
+
+
+ ### Windows template
+ # Windows thumbnail cache files
+ Thumbs.db
+ Thumbs.db:encryptable
+ ehthumbs.db
+ ehthumbs_vista.db
+
+ # Dump file
+ *.stackdump
+
+ # Folder config file
+ [Dd]esktop.ini
+
+ # Recycle Bin used on file shares
+ $RECYCLE.BIN/
+
+ # Windows Installer files
+ *.cab
+ *.msix
+ *.msm
+ *.msp
+
+ # Windows shortcuts
+ *.lnk
+
+ ### VisualStudioCode template
+ .vscode/*
+ !.vscode/settings.json
+ !.vscode/tasks.json
+ !.vscode/launch.json
+ !.vscode/extensions.json
+ *.code-workspace
+
+ # Local History for Visual Studio Code
+ .history/
+
+ ### CMake template
+ CMakeLists.txt.user
+ CMakeCache.txt
+ CMakeFiles
+ CMakeScripts
+ Testing
+ cmake_install.cmake
+ install_manifest.txt
+ compile_commands.json
+ CTestTestfile.cmake
+ _deps
+
+ ### VisualStudio template
+ ## Ignore Visual Studio temporary files, build results, and
+ ## files generated by popular Visual Studio add-ons.
+ ##
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+ # User-specific files
+ *.rsuser
+ *.suo
+ *.user
+ *.userosscache
+ *.sln.docstates
+
+ # User-specific files (MonoDevelop/Xamarin Studio)
+ *.userprefs
+
+ # Mono auto generated files
+ mono_crash.*
+
+ # Build results
+ [Dd]ebug/
+ [Dd]ebugPublic/
+ [Rr]elease/
+ [Rr]eleases/
+ x64/
+ x86/
+ [Ww][Ii][Nn]32/
+ [Aa][Rr][Mm]/
+ [Aa][Rr][Mm]64/
+ bld/
+ [Bb]in/
+ [Oo]bj/
+ [Ll]og/
+ [Ll]ogs/
+
+ # Visual Studio 2015/2017 cache/options directory
+ .vs/
+ # Uncomment if you have tasks that create the project's static files in wwwroot
+ #wwwroot/
+
+ # Visual Studio 2017 auto generated files
+ Generated\ Files/
+
+ # MSTest test Results
+ [Tt]est[Rr]esult*/
+ [Bb]uild[Ll]og.*
+
+ # NUnit
+ *.VisualState.xml
+ TestResult.xml
+ nunit-*.xml
+
+ # Build Results of an ATL Project
+ [Dd]ebugPS/
+ [Rr]eleasePS/
+ dlldata.c
+
+ # Benchmark Results
+ BenchmarkDotNet.Artifacts/
+
+ # .NET Core
+ project.lock.json
+ project.fragment.lock.json
+ artifacts/
+
+ # ASP.NET Scaffolding
+ ScaffoldingReadMe.txt
+
+ # StyleCop
+ StyleCopReport.xml
+
+ # Files built by Visual Studio
+ *_i.c
+ *_p.c
+ *_h.h
+ *.ilk
+ *.meta
+ *.obj
+ *.iobj
+ *.pch
+ *.pdb
+ *.ipdb
+ *.pgc
+ *.pgd
+ *.rsp
+ *.sbr
+ *.tlb
+ *.tli
+ *.tlh
+ *.tmp_proj
+ *_wpftmp.csproj
+ *.vspscc
+ *.vssscc
+ .builds
+ *.pidb
+ *.svclog
+ *.scc
+
+ # Chutzpah Test files
+ _Chutzpah*
+
+ # Visual C++ cache files
+ ipch/
+ *.aps
+ *.ncb
+ *.opendb
+ *.opensdf
+ *.sdf
+ *.cachefile
+ *.VC.db
+ *.VC.VC.opendb
+
+ # Visual Studio profiler
+ *.psess
+ *.vsp
+ *.vspx
+ *.sap
+
+ # Visual Studio Trace Files
+ *.e2e
+
+ # TFS 2012 Local Workspace
+ $tf/
+
+ # Guidance Automation Toolkit
+ *.gpState
+
+ # ReSharper is a .NET coding add-in
+ _ReSharper*/
+ *.[Rr]e[Ss]harper
+ *.DotSettings.user
+
+ # TeamCity is a build add-in
+ _TeamCity*
+
+ # DotCover is a Code Coverage Tool
+ *.dotCover
+
+ # AxoCover is a Code Coverage Tool
+ .axoCover/*
+ !.axoCover/settings.json
+
+ # Coverlet is a free, cross platform Code Coverage Tool
+ coverage*.json
+ coverage*.xml
+ coverage*.info
+
+ # Visual Studio code coverage results
+ *.coverage
+ *.coveragexml
+
+ # NCrunch
+ _NCrunch_*
+ .*crunch*.local.xml
+ nCrunchTemp_*
+
+ # MightyMoose
+ *.mm.*
+ AutoTest.Net/
+
+ # Web workbench (sass)
+ .sass-cache/
+
+ # Installshield output folder
+ [Ee]xpress/
+
+ # DocProject is a documentation generator add-in
+ DocProject/buildhelp/
+ DocProject/Help/*.HxT
+ DocProject/Help/*.HxC
+ DocProject/Help/*.hhc
+ DocProject/Help/*.hhk
+ DocProject/Help/*.hhp
+ DocProject/Help/Html2
+ DocProject/Help/html
+
+ # Click-Once directory
+ publish/
+
+ # Publish Web Output
+ *.[Pp]ublish.xml
+ *.azurePubxml
+ # Note: Comment the next line if you want to checkin your web deploy settings,
+ # but database connection strings (with potential passwords) will be unencrypted
+ *.pubxml
+ *.publishproj
+
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
+ # checkin your Azure Web App publish settings, but sensitive information contained
+ # in these scripts will be unencrypted
+ PublishScripts/
+
+ # NuGet Packages
+ *.nupkg
+ # NuGet Symbol Packages
+ *.snupkg
+ # The packages folder can be ignored because of Package Restore
+ **/[Pp]ackages/*
+ # except build/, which is used as an MSBuild target.
+ !**/[Pp]ackages/build/
+ # Uncomment if necessary however generally it will be regenerated when needed
+ #!**/[Pp]ackages/repositories.config
+ # NuGet v3's project.json files produces more ignorable files
+ *.nuget.props
+ *.nuget.targets
+
+ # Microsoft Azure Build Output
+ csx/
+ *.build.csdef
+
+ # Microsoft Azure Emulator
+ ecf/
+ rcf/
+
+ # Windows Store app package directories and files
+ AppPackages/
+ BundleArtifacts/
+ Package.StoreAssociation.xml
+ _pkginfo.txt
+ *.appx
+ *.appxbundle
+ *.appxupload
+
+ # Visual Studio cache files
+ # files ending in .cache can be ignored
+ *.[Cc]ache
+ # but keep track of directories ending in .cache
+ !?*.[Cc]ache/
+
+ # Others
+ ClientBin/
+ ~$*
+ *.dbmdl
+ *.dbproj.schemaview
+ *.jfm
+ *.pfx
+ *.publishsettings
+ orleans.codegen.cs
+
+ # Including strong name files can present a security risk
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
+ #*.snk
+
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+ #bower_components/
+
+ # RIA/Silverlight projects
+ Generated_Code/
+
+ # Backup & report files from converting an old project file
+ # to a newer Visual Studio version. Backup files are not needed,
+ # because we have git ;-)
+ _UpgradeReport_Files/
+ Backup*/
+ UpgradeLog*.XML
+ UpgradeLog*.htm
+ ServiceFabricBackup/
+ *.rptproj.bak
+
+ # SQL Server files
+ *.mdf
+ *.ldf
+ *.ndf
+
+ # Business Intelligence projects
+ *.rdl.data
+ *.bim.layout
+ *.bim_*.settings
+ *.rptproj.rsuser
+ *- [Bb]ackup.rdl
+ *- [Bb]ackup ([0-9]).rdl
+ *- [Bb]ackup ([0-9][0-9]).rdl
+
+ # Microsoft Fakes
+ FakesAssemblies/
+
+ # GhostDoc plugin setting file
+ *.GhostDoc.xml
+
+ # Node.js Tools for Visual Studio
+ .ntvs_analysis.dat
+ node_modules/
+
+ # Visual Studio 6 build log
+ *.plg
+
+ # Visual Studio 6 workspace options file
+ *.opt
+
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+ *.vbw
+
+ # Visual Studio LightSwitch build output
+ **/*.HTMLClient/GeneratedArtifacts
+ **/*.DesktopClient/GeneratedArtifacts
+ **/*.DesktopClient/ModelManifest.xml
+ **/*.Server/GeneratedArtifacts
+ **/*.Server/ModelManifest.xml
+ _Pvt_Extensions
+
+ # Paket dependency manager
+ .paket/paket.exe
+ paket-files/
+
+ # FAKE - F# Make
+ .fake/
+
+ # CodeRush personal settings
+ .cr/personal
+
+ # Python Tools for Visual Studio (PTVS)
+ __pycache__/
+ *.pyc
+
+ # Cake - Uncomment if you are using it
+ # tools/**
+ # !tools/packages.config
+
+ # Tabs Studio
+ *.tss
+
+ # Telerik's JustMock configuration file
+ *.jmconfig
+
+ # BizTalk build output
+ *.btp.cs
+ *.btm.cs
+ *.odx.cs
+ *.xsd.cs
+
+ # OpenCover UI analysis results
+ OpenCover/
+
+ # Azure Stream Analytics local run output
+ ASALocalRun/
+
+ # MSBuild Binary and Structured Log
+ *.binlog
+
+ # NVidia Nsight GPU debugger configuration file
+ *.nvuser
+
+ # MFractors (Xamarin productivity tool) working folder
+ .mfractor/
+
+ # Local History for Visual Studio
+ .localhistory/
+
+ # BeatPulse healthcheck temp database
+ healthchecksdb
+
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
+ MigrationBackup/
+
+ # Ionide (cross platform F# VS Code tools) working folder
+ .ionide/
+
+ # Fody - auto-generated XML schema
+ FodyWeavers.xsd
+
+ ### Python template
+ # Byte-compiled / optimized / DLL files
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ venv/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ ### Backup template
+ *.gho
+ *.ori
+ *.orig
+
+ ### Node template
+ # Logs
+ logs
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+ lerna-debug.log*
+
+ # Diagnostic reports (https://nodejs.org/api/report.html)
+ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+ # Runtime data
+ pids
+ *.pid
+ *.seed
+ *.pid.lock
+
+ # Directory for instrumented libs generated by jscoverage/JSCover
+ lib-cov
+
+ # Coverage directory used by tools like istanbul
+ coverage
+ *.lcov
+
+ # nyc test coverage
+ .nyc_output
+
+ # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+ .grunt
+
+ # Bower dependency directory (https://bower.io/)
+ bower_components
+
+ # node-waf configuration
+ .lock-wscript
+
+ # Compiled binary addons (https://nodejs.org/api/addons.html)
+ build/Release
+
+ # Dependency directories
+ jspm_packages/
+
+ # Snowpack dependency directory (https://snowpack.dev/)
+ web_modules/
+
+ # TypeScript cache
+ *.tsbuildinfo
+
+ # Optional npm cache directory
+ .npm
+
+ # Optional eslint cache
+ .eslintcache
+
+ # Microbundle cache
+ .rpt2_cache/
+ .rts2_cache_cjs/
+ .rts2_cache_es/
+ .rts2_cache_umd/
+
+ # Optional REPL history
+ .node_repl_history
+
+ # Output of 'npm pack'
+
+ # Yarn Integrity file
+ .yarn-integrity
+
+ # dotenv environment variables file
+ .env.test
+
+ # parcel-bundler cache (https://parceljs.org/)
+ .parcel-cache
+
+ # Next.js build output
+ .next
+ out
+
+ # Nuxt.js build / generate output
+ .nuxt
+ dist
+
+ # Gatsby files
+ .cache/
+ # Comment in the public line in if your project uses Gatsby and not Next.js
+ # https://nextjs.org/blog/next-9-1#public-directory-support
+ # public
+
+ # vuepress build output
+ .vuepress/dist
+
+ # Serverless directories
+ .serverless/
+
+ # FuseBox cache
+ .fusebox/
+
+ # DynamoDB Local files
+ .dynamodb/
+
+ # TernJS port file
+ .tern-port
+
+ # Stores VSCode versions used for testing VSCode extensions
+ .vscode-test
+
+ # yarn v2
+ .yarn/cache
+ .yarn/unplugged
+ .yarn/build-state.yml
+ .yarn/install-state.gz
+ .pnp.*
+
+ ### VirtualEnv template
+ # Virtualenv
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+ [Bb]in
+ [Ii]nclude
+ [Ll]ib
+ [Ll]ib64
+ [Ll]ocal
+ pyvenv.cfg
+ pip-selfcheck.json
+
+ ### macOS template
+ # General
+ .DS_Store
+ .AppleDouble
+ .LSOverride
+
+ # Icon must end with two \r
+ Icon
+
+ # Thumbnails
+ ._*
+
+ # Files that might appear in the root of a volume
+ .DocumentRevisions-V100
+ .fseventsd
+ .Spotlight-V100
+ .TemporaryItems
+ .Trashes
+ .VolumeIcon.icns
+ .com.apple.timemachine.donotpresent
+
+ # Directories potentially created on remote AFP share
+ .AppleDB
+ .AppleDesktop
+ Network Trash Folder
+ Temporary Items
+ .apdisk
+
+ ### Go template
+ # Binaries for programs and plugins
+ *.exe~
+ *.dll
+ *.dylib
+
+ # Test binary, built with `go test -c`
+ *.test
+
+ # Output of the go coverage tool, specifically when used with LiteIDE
+ *.out
+
+ # Dependency directories (remove the comment below to include it)
+ # vendor/
+
+ ### C template
+ # Prerequisites
+ *.d
+
+ # Object files
+ *.o
+ *.ko
+ *.elf
+
+ # Linker output
+ *.map
+ *.exp
+
+ # Precompiled Headers
+ *.gch
+
+ # Libraries
+ *.lib
+ *.a
+ *.la
+ *.lo
+
+ # Shared objects (inc. Windows DLLs)
+ *.so.*
+
+ # Executables
+ *.app
+ *.i*86
+ *.x86_64
+ *.hex
+
+ # Debug files
+ *.dSYM/
+ *.su
+ *.idb
+
+ # Kernel Module Compile Results
+ *.mod*
+ *.cmd
+ .tmp_versions/
+ modules.order
+ Module.symvers
+ Mkfile.old
+ dkms.conf
+
+ ### Example user template template
+ ### Example user template
+
+ # IntelliJ project files
+ .idea
+ *.iml
+ gen
+ ### TextMate template
+ *.tmproj
+ *.tmproject
+ tmtags
+
+ ### Anjuta template
+ # Local configuration folder and symbol database
+ /.anjuta/
+ /.anjuta_sym_db.db
+
+ ### XilinxISE template
+ # intermediate build files
+ *.bgn
+ *.bit
+ *.bld
+ *.cmd_log
+ *.drc
+ *.ll
+ *.lso
+ *.msd
+ *.msk
+ *.ncd
+ *.ngc
+ *.ngd
+ *.ngr
+ *.pad
+ *.par
+ *.pcf
+ *.prj
+ *.ptwx
+ *.rbb
+ *.rbd
+ *.stx
+ *.syr
+ *.twr
+ *.twx
+ *.unroutes
+ *.ut
+ *.xpi
+ *.xst
+ *_bitgen.xwbt
+ *_envsettings.html
+ *_map.map
+ *_map.mrp
+ *_map.ngm
+ *_map.xrpt
+ *_ngdbuild.xrpt
+ *_pad.csv
+ *_pad.txt
+ *_par.xrpt
+ *_summary.html
+ *_summary.xml
+ *_usage.xml
+ *_xst.xrpt
+
+ # iMPACT generated files
+ _impactbatch.log
+ impact.xsl
+ impact_impact.xwbt
+ ise_impact.cmd
+ webtalk_impact.xml
+
+ # Core Generator generated files
+ xaw2verilog.log
+
+ # project-wide generated files
+ *.gise
+ par_usage_statistics.html
+ usage_statistics_webtalk.html
+ webtalk.log
+ webtalk_pn.xml
+
+ # generated folders
+ iseconfig/
+ xlnx_auto_0_xdb/
+ xst/
+ _ngo/
+ _xmsgs/
+
+ ### TortoiseGit template
+ # Project-level settings
+ /.tgitconfig
+
+ ### C++ template
+ # Prerequisites
+
+ # Compiled Object files
+ *.slo
+
+ # Precompiled Headers
+
+ # Compiled Dynamic libraries
+
+ # Fortran module files
+ *.mod
+ *.smod
+
+ # Compiled Static libraries
+ *.lai
+
+ # Executables
+
+ ### SublimeText template
+ # Cache files for Sublime Text
+ *.tmlanguage.cache
+ *.tmPreferences.cache
+ *.stTheme.cache
+
+ # Workspace files are user-specific
+ *.sublime-workspace
+
+ # Project files should be checked into the repository, unless a significant
+ # proportion of contributors will probably not be using Sublime Text
+ # *.sublime-project
+
+ # SFTP configuration file
+ sftp-config.json
+ sftp-config-alt*.json
+
+ # Package control specific files
+ Package Control.last-run
+ Package Control.ca-list
+ Package Control.ca-bundle
+ Package Control.system-ca-bundle
+ Package Control.cache/
+ Package Control.ca-certs/
+ Package Control.merged-ca-bundle
+ Package Control.user-ca-bundle
+ oscrypto-ca-bundle.crt
+ bh_unicode_properties.cache
+
+ # Sublime-github package stores a github token in this file
+ # https://packagecontrol.io/packages/sublime-github
+ GitHub.sublime-settings
+
+ ### Vim template
+ # Swap
+ [._]*.s[a-v][a-z]
+ !*.svg # comment out if you don't need vector files
+ [._]*.sw[a-p]
+ [._]s[a-rt-v][a-z]
+ [._]ss[a-gi-z]
+ [._]sw[a-p]
+
+ # Session
+ Session.vim
+ Sessionx.vim
+
+ # Temporary
+ .netrwhist
+ # Auto-generated tag files
+ tags
+ # Persistent undo
+ [._]*.un~
+
+ ### Autotools template
+ # http://www.gnu.org/software/automake
+
+ Makefile.in
+ /ar-lib
+ /mdate-sh
+ /py-compile
+ /test-driver
+ /ylwrap
+ .deps/
+ .dirstamp
+
+ # http://www.gnu.org/software/autoconf
+
+ autom4te.cache
+ /autoscan.log
+ /autoscan-*.log
+ /aclocal.m4
+ /compile
+ /config.guess
+ /config.h.in
+ /config.log
+ /config.status
+ /config.sub
+ /configure
+ /configure.scan
+ /depcomp
+ /install-sh
+ /missing
+ /stamp-h1
+
+ # https://www.gnu.org/software/libtool/
+
+ /ltmain.sh
+
+ # http://www.gnu.org/software/texinfo
+
+ /texinfo.tex
+
+ # http://www.gnu.org/software/m4/
+
+ m4/libtool.m4
+ m4/ltoptions.m4
+ m4/ltsugar.m4
+ m4/ltversion.m4
+ m4/lt~obsolete.m4
+
+ # Generated Makefile
+ # (meta build system like autotools,
+ # can automatically generate from config.status script
+ # (which is called by configure script))
+
+ ### Lua template
+ # Compiled Lua sources
+ luac.out
+
+ # luarocks build files
+ *.src.rock
+ *.tar.gz
+
+ # Object files
+ *.os
+
+ # Precompiled Headers
+
+ # Libraries
+ *.def
+
+ # Shared objects (inc. Windows DLLs)
+
+ # Executables
+
+
+ ### Vagrant template
+ # General
+ .vagrant/
+
+ # Log files (if you are creating logs in debug mode, uncomment this)
+ # *.log
+
+ ### Xcode template
+ # Xcode
+ #
+ # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
+
+ ## User settings
+ xcuserdata/
+
+ ## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)
+ *.xcscmblueprint
+ *.xccheckout
+
+ ## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)
+ DerivedData/
+ *.moved-aside
+ *.pbxuser
+ !default.pbxuser
+ *.mode1v3
+ !default.mode1v3
+ *.mode2v3
+ !default.mode2v3
+ *.perspectivev3
+ !default.perspectivev3
+
+ ## Gcc Patch
+ /*.gcno
+
+ ### Linux template
+
+ # temporary files which can be created if a process still has a handle open of a deleted file
+ .fuse_hidden*
+
+ # KDE directory preferences
+ .directory
+
+ # Linux trash folder which might appear on any partition or disk
+ .Trash-*
+
+ # .nfs files are created when an open file is removed but is still being accessed
+ .nfs*
+
+ ### GitBook template
+ # Node rules:
+ ## Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
+
+ ## Dependency directory
+ ## Commenting this out is preferred by some people, see
+ ## https://docs.npmjs.com/misc/faq#should-i-check-my-node_modules-folder-into-git
+ node_modules
+
+ # Book build output
+ _book
+
+ # eBook build output
+ *.epub
+ *.mobi
+
+ ### CodeSniffer template
+ # gitignore for the PHP Codesniffer framework
+ # website: https://github.com/squizlabs/PHP_CodeSniffer
+ #
+ # Recommended template: PHP.gitignore
+
+ /wpcs/*
+
+ ### PuTTY template
+ # Private key
+ *.ppk
+ *_pb2.py
+ *.pth
+ *.pth.tar
+ *.pt
+ *.npy
+ __pycache__
+ *.egg-info
+ experiment_config.yaml
+ api-log/
+ log/
+ htmlcov
+ *.lock
+ .coverage*
+ /test_*
+ .python-version
+ /name.txt
+ /summary_log
+ policy_*
+ /data
+ .vscode
+ formatted_*
+ **/exp
+ **/benchmark
+ **/model_zoo
+ *ckpt*
+ log*
+ *.puml.png
+ *.puml.eps
+ *.puml.svg
+ default*
+ events.*
+
+ # DI-engine special key
+ *default_logger.txt
+ *default_tb_logger
+ *evaluate.txt
+ *total_config.py
+ eval_config.py
+ collect_demo_data_config.py
+ !ding/**/*.py
+ events.*
+
+ evogym/*
DI-engine/.style.yapf ADDED
@@ -0,0 +1,11 @@
+ [style]
+ # For explanation and more information: https://github.com/google/yapf
+ BASED_ON_STYLE=pep8
+ DEDENT_CLOSING_BRACKETS=True
+ SPLIT_BEFORE_FIRST_ARGUMENT=True
+ ALLOW_SPLIT_BEFORE_DICT_VALUE=False
+ JOIN_MULTIPLE_LINES=False
+ COLUMN_LIMIT=120
+ BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF=True
+ BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION=2
+ SPACES_AROUND_POWER_OPERATOR=True
DI-engine/CHANGELOG ADDED
@@ -0,0 +1,489 @@
+ 2023.11.06(v0.5.0)
+ - env: add tabmwp env (#667)
+ - env: polish anytrading env issues (#731)
+ - algo: add PromptPG algorithm (#667)
+ - algo: add Plan Diffuser algorithm (#700)
+ - algo: add new pipeline implementation of IMPALA algorithm (#713)
+ - algo: add dropout layers to DQN-style algorithms (#712)
+ - feature: add new pipeline agent for sac/ddpg/a2c/ppo and Hugging Face support (#637) (#730) (#737)
+ - feature: add more unittest cases for model (#728)
+ - feature: add collector logging in new pipeline (#735)
+ - fix: logger middleware problems (#715)
+ - fix: ppo parallel bug (#709)
+ - fix: typo in optimizer_helper.py (#726)
+ - fix: mlp dropout if condition bug
+ - fix: drex collecting data unittest bugs
+ - style: polish env manager/wrapper comments and API doc (#742)
+ - style: polish model comments and API doc (#722) (#729) (#734) (#736) (#741)
+ - style: polish policy comments and API doc (#732)
+ - style: polish rl_utils comments and API doc (#724)
+ - style: polish torch_utils comments and API doc (#738)
+ - style: update README.md and Colab demo (#733)
+ - style: update metaworld docker image
+
+ 2023.08.23(v0.4.9)
+ - env: add cliffwalking env (#677)
+ - env: add lunarlander ppo config and example
+ - algo: add BCQ offline RL algorithm (#640)
+ - algo: add Dreamerv3 model-based RL algorithm (#652)
+ - algo: add tensor stream merge network tools (#673)
+ - algo: add scatter connection model (#680)
+ - algo: refactor Decision Transformer in new pipeline and support img input and discrete output (#693)
+ - algo: add three variants of Bilinear classes and a FiLM class (#703)
+ - feature: polish offpolicy RL multi-gpu DDP training (#679)
+ - feature: add middleware for Ape-X distributed pipeline (#696)
+ - feature: add example for evaluating trained DQN (#706)
+ - fix: to_ndarray fails to assign dtype for scalars (#708)
+ - fix: evaluator return episode_info compatibility bug
+ - fix: cql example entry wrong config bug
+ - fix: enable_save_figure env interface
+ - fix: redundant env info bug in evaluator
+ - fix: to_item unittest bug
+ - style: polish and simplify requirements (#672)
+ - style: add Hugging Face Model Zoo badge (#674)
+ - style: add openxlab Model Zoo badge (#675)
+ - style: fix py37 macos ci bug and update default pytorch from 1.7.1 to 1.12.1 (#678)
+ - style: fix mujoco-py compatibility issue for cython<3 (#711)
+ - style: fix type spell error (#704)
+ - style: fix pypi release actions ubuntu 18.04 bug
+ - style: update contact information (e.g. wechat)
+ - style: polish algorithm doc tables
+
+ 2023.05.25(v0.4.8)
+ - env: fix gym hybrid reward dtype bug (#664)
+ - env: fix atari env id noframeskip bug (#655)
+ - env: fix typo in gym any_trading env (#654)
+ - env: update td3bc d4rl config (#659)
+ - env: polish bipedalwalker config
+ - algo: add EDAC offline RL algorithm (#639)
+ - algo: add LN and GN norm_type support in ResBlock (#660)
+ - algo: add normal value norm baseline for PPOF (#658)
+ - algo: polish last layer init/norm in MLP (#650)
+ - algo: polish TD3 monitor variable
+ - feature: add MAPPO/MASAC task example (#661)
+ - feature: add PPO example for complex env observation (#644)
+ - feature: add barrier middleware (#570)
+ - fix: abnormal collector log and add record_random_collect option (#662)
+ - fix: to_item compatibility bug (#646)
+ - fix: trainer dtype transform compatibility bug
+ - fix: pettingzoo 1.23.0 compatibility bug
+ - fix: ensemble head unittest bug
+ - style: fix incompatible gym version bug in Dockerfile.env (#653)
+ - style: add more algorithm docs
+
+ 2023.04.11(v0.4.7)
+ - env: add dmc2gym env support and baseline (#451)
+ - env: update pettingzoo to the latest version (#597)
+ - env: polish icm/rnd+onppo config bugs and add app_door_to_key env (#564)
+ - env: add lunarlander continuous TD3/SAC config
+ - env: polish lunarlander discrete C51 config
+ - algo: add Procedure Cloning (PC) imitation learning algorithm (#514)
+ - algo: add Munchausen Reinforcement Learning (MDQN) algorithm (#590)
+ - algo: add reward/value norm methods: popart & value rescale & symlog (#605)
+ - algo: polish reward model config and training pipeline (#624)
+ - algo: add PPOF reward space demo support (#608)
+ - algo: add PPOF Atari demo support (#589)
+ - algo: polish dqn default config and env examples (#611)
+ - algo: polish comment and clean code about SAC
+ - feature: add language model (e.g. GPT) training utils (#625)
+ - feature: remove policy cfg sub fields requirements (#620)
+ - feature: add full wandb support (#579)
+ - fix: confusing shallow copy operation about next_obs (#641)
+ - fix: unsqueeze action_args in PDQN when shape is 1 (#599)
+ - fix: evaluator return_info tensor type bug (#592)
+ - fix: deque buffer wrapper PER bug (#586)
+ - fix: reward model save method compatibility bug
+ - fix: logger assertion and unittest bug
+ - fix: bfs test py3.9 compatibility bug
+ - fix: zergling collector unittest bug
+ - style: add DI-engine torch-rpc p2p communication docker (#628)
+ - style: add D4RL docker (#591)
+ - style: correct typo in task (#617)
+ - style: correct typo in time_helper (#602)
+ - style: polish readme and add treetensor example
+ - style: update contributing doc
+
+ 2023.02.16(v0.4.6)
+ - env: add metadrive env and related ppo config (#574)
+ - env: add acrobot env and related dqn config (#577)
+ - env: add carracing in box2d (#575)
+ - env: add new gym hybrid viz (#563)
+ - env: update cartpole IL config (#578)
+ - algo: add BDQ algorithm (#558)
+ - algo: add procedure cloning model (#573)
+ - feature: add simplified PPOF (PPO × Family) interface (#567) (#568) (#581) (#582)
+ - fix: to_device and prev_state bug when using ttorch (#571)
+ - fix: py38 and numpy unittest bugs (#565)
+ - fix: typo in contrastive_loss.py (#572)
+ - fix: dizoo envs pkg installation bugs
+ - fix: multi_trainer middleware unittest bug
+ - style: add evogym docker (#580)
+ - style: fix metaworld docker bug
+ - style: fix setuptools high version incompatibility bug
+ - style: extend treetensor lowest version
+
+ 2022.12.13(v0.4.5)
+ - env: add beergame supply chain optimization env (#512)
+ - env: add env gym_pybullet_drones (#526)
+ - env: rename eval reward to episode return (#536)
+ - algo: add policy gradient algo implementation (#544)
+ - algo: add MADDPG algo implementation (#550)
+ - algo: add IMPALA continuous algo implementation (#551)
+ - algo: add MADQN algo implementation (#540)
+ - feature: add new task IMPALA-type distributed training scheme (#321)
+ - feature: add load and save method for replaybuffer (#542)
+ - feature: add more DingEnvWrapper example (#525)
+ - feature: add evaluator more info viz support (#538)
+ - feature: add traceback log for subprocess env manager (#534)
+ - fix: halfcheetah td3 config file (#537)
+ - fix: mujoco action_clip args compatibility bug (#535)
+ - fix: atari a2c config entry bug
+ - fix: drex unittest compatibility bug
+ - style: add Roadmap issue of DI-engine (#548)
+ - style: update related project link and new env doc
+
+ 2022.10.31(v0.4.4)
+ - env: add modified gym-hybrid including moving, sliding and hardmove (#505) (#519)
+ - env: add evogym support (#495) (#527)
+ - env: add save_replay_gif option (#506)
+ - env: adapt minigrid_env and related config to latest MiniGrid v2.0.0 (#500)
+ - algo: add pcgrad optimizer (#489)
+ - algo: add some features in MLP and ResBlock (#511)
+ - algo: delete mcts related modules (#518)
+ - feature: add wandb middleware and demo (#488) (#523) (#528)
+ - feature: add new properties in Context (#499)
+ - feature: add single env policy wrapper for policy deployment
+ - feature: add custom model demo and doc
+ - fix: build logger args and unittests (#522)
+ - fix: total_loss calculation in PDQN (#504)
+ - fix: save gif function bug
+ - fix: level sample unittest bug
+ - style: update contact email address (#503)
+ - style: polish env log and resblock name
+ - style: add details button in readme
+
+ 2022.09.23(v0.4.3)
+ - env: add rule-based gomoku expert (#465)
+ - algo: fix a2c policy batch size bug (#481)
+ - algo: enable activation option in collaq attention and mixer
+ - algo: minor fix about IBC (#477)
+ - feature: add IGM support (#486)
+ - feature: add tb logger middleware and demo
+ - fix: the type conversion in ding_env_wrapper (#483)
+ - fix: di-orchestrator version bug in unittest (#479)
+ - fix: data collection errors caused by shallow copies (#475)
+ - fix: gym==0.26.0 seed args bug
+ - style: add readme tutorial link (environment & algorithm) (#490) (#493)
+ - style: adjust location of the default_model method in policy (#453)
+
+ 2022.09.08(v0.4.2)
+ - env: add rocket env (#449)
+ - env: updated pettingzoo env and improved related performance (#457)
+ - env: add mario env demo (#443)
+ - env: add MAPPO multi-agent config (#464)
+ - env: add mountain car (discrete action) environment (#452)
+ - env: fix multi-agent mujoco gym compatibility bug
+ - env: fix gfootball env save_replay variable init bug
+ - algo: add IBC (Implicit Behaviour Cloning) algorithm (#401)
+ - algo: add BCO (Behaviour Cloning from Observation) algorithm (#270)
+ - algo: add continuous PPOPG algorithm (#414)
+ - algo: add PER in CollaQ (#472)
+ - algo: add activation option in QMIX and CollaQ
+ - feature: update ctx to dataclass (#467)
+ - fix: base_env FinalMeta bug about gym 0.25.0-0.25.1
+ - fix: config inplace modification bug
+ - fix: ding cli no argument problem
+ - fix: import errors after running setup.py (jinja2, markupsafe)
+ - fix: conda py3.6 and cross platform build bug
+ - style: add project state and datetime in log dir (#455)
+ - style: polish notes for q-learning model (#427)
+ - style: revision to mujoco dockerfile and validation (#474)
+ - style: add dockerfile for cityflow env
+ - style: polish default output log format
+
+ 2022.08.12(v0.4.1)
+ - env: add gym trading env (#424)
+ - env: add board games env (tictactoe, gomoku, chess) (#356)
+ - env: add sokoban env (#397) (#429)
+ - env: add BC and DQN demo for gfootball (#418) (#423)
+ - env: add discrete pendulum env (#395)
+ - algo: add STEVE model-based algorithm (#363)
+ - algo: add PLR algorithm (#408)
+ - algo: plugin ST-DIM in PPO (#379)
+ - feature: add final result saving in training pipeline
+ - fix: random policy randomness bug
+ - fix: action_space seed compatibility bug
+ - fix: discard message sent by self in redis mq (#354)
+ - fix: remove pace controller (#400)
+ - fix: import error in serial_pipeline_trex (#410)
+ - fix: unittest hang and fail bug (#413)
+ - fix: DREX collect data unittest bug
+ - fix: remove unused import cv2
+ - fix: ding CLI env/policy option bug
+ - style: upgrade Python version from 3.6-3.8 to 3.7-3.9
+ - style: upgrade gym version from 0.20.0 to 0.25.0
+ - style: upgrade torch version from 1.10.0 to 1.12.0
+ - style: upgrade mujoco bin from 2.0.0 to 2.1.0
+ - style: add buffer api description (#371)
+ - style: polish VAE comments (#404)
+ - style: unittest for FQF (#412)
+ - style: add metaworld dockerfile (#432)
+ - style: remove opencv requirement in default setting
+ - style: update long description in setup.py
+
+ 2022.06.21(v0.4.0)
+ - env: add MAPPO/MASAC all configs in SMAC (#310) **(SOTA results in SMAC!!!)**
+ - env: add dmc2gym env (#344) (#360)
+ - env: remove DI-star requirements of dizoo/smac, use official pysc2 (#302)
+ - env: add latest GAIL mujoco config (#298)
+ - env: polish procgen env (#311)
+ - env: add MBPO ant and humanoid config for mbpo (#314)
+ - env: fix slime volley env obs space bug when agent_vs_agent
+ - env: fix smac env obs space bug
+ - env: fix import path error in lunarlander (#362)
+ - algo: add Decision Transformer algorithm (#327) (#364)
+ - algo: add on-policy PPG algorithm (#312)
+ - algo: add DDPPO & add model-based SAC with lambda-return algorithm (#332)
+ - algo: add infoNCE loss and ST-DIM algorithm (#326)
+ - algo: add FQF distributional RL algorithm (#274)
+ - algo: add continuous BC algorithm (#318)
+ - algo: add pure policy gradient PPO algorithm (#382)
+ - algo: add SQIL + SAC algorithm (#348)
+ - algo: polish NGU and related modules (#283) (#343) (#353)
+ - algo: add marl distributional td loss (#331)
+ - feature: add new worker middleware (#236)
+ - feature: refactor model-based RL pipeline (ding/world_model) (#332)
+ - feature: refactor logging system in the whole DI-engine (#316)
+ - feature: add env supervisor design (#330)
+ - feature: support async reset for envpool env manager (#250)
+ - feature: add log videos to tensorboard (#320)
+ - feature: refactor impala cnn encoder interface (#378)
+ - fix: env save replay bug
+ - fix: transformer mask inplace operation bug
+ - fix: transition_with_policy_data bug in SAC and PPG
+ - style: add dockerfile for ding:hpc image (#337)
+ - style: fix mpire 2.3.5 which handles default processes more elegantly (#306)
+ - style: use FORMAT_DIR instead of ./ding (#309)
+ - style: update quickstart colab link (#347)
+ - style: polish comments in ding/model/common (#315)
+ - style: update mujoco docker download path (#386)
+ - style: fix protobuf new version compatibility bug
+ - style: fix torch1.8.0 torch.div compatibility bug
+ - style: update doc links in readme
+ - style: add outline in readme and update wechat image
+ - style: update head image and refactor docker dir
+
+ 2022.04.23(v0.3.1)
+ - env: polish and standardize dizoo config (#252) (#255) (#249) (#246) (#262) (#261) (#266) (#273) (#263) (#280) (#259) (#286) (#277) (#290) (#289) (#299)
+ - env: add GRF academic env and config (#281)
+ - env: update env interface of GRF (#258)
+ - env: update D4RL offline RL env and config (#285)
+ - env: polish PomdpAtariEnv (#254)
+ - algo: DREX algorithm (#218)
+ - feature: separate mq and parallel modules, add redis (#247)
+ - feature: rename env variables; fix attach_to parameter (#244)
+ - feature: env implementation check (#275)
+ - feature: adjust and set the max column number of tabulate in log (#296)
+ - feature: add drop_extra option for sample collect
+ - feature: speed up GTrXL forward method + GRU unittest (#253) (#292)
+ - fix: add act_scale in DingEnvWrapper; fix envpool env manager (#245)
+ - fix: auto_reset=False and env_ref bug in env manager (#248)
+ - fix: data type and deepcopy bug in RND (#288)
+ - fix: share_memory bug and multi_mujoco env (#279)
+ - fix: some bugs in GTrXL (#276)
+ - fix: update gym_vector_env_manager and add more unittest (#241)
+ - fix: mdpolicy random collect bug (#293)
+ - fix: gym.wrapper save video replay bug
+ - fix: collect abnormal step format bug and add unittest
+ - test: add buffer benchmark & socket test (#284)
+ - style: upgrade mpire (#251)
+ - style: add GRF (google research football) docker (#256)
+ - style: update policy and gail comment
+
+ 2022.03.24(v0.3.0)
+ - env: add bitflip HER DQN benchmark (#192) (#193) (#197)
+ - env: slime volley league training demo (#229)
+ - algo: Gated Transformer-XL (GTrXL) algorithm (#136)
+ - algo: TD3 + VAE (HyAR) latent action algorithm (#152)
+ - algo: stochastic dueling network (#234)
+ - algo: use log prob instead of using prob in ACER (#186)
+ - feature: support envpool env manager (#228)
+ - feature: add league main and other improvements in new framework (#177) (#214)
+ - feature: add pace controller middleware in new framework (#198)
+ - feature: add auto recover option in new framework (#242)
+ - feature: add k8s parser in new framework (#243)
+ - feature: support async event handler and logger (#213)
+ - feature: add grad norm calculator (#205)
+ - feature: add gym vector env manager (#147)
+ - feature: add train_iter and env_step in serial pipeline (#212)
+ - feature: add rich logger handler (#219) (#223) (#232)
+ - feature: add naive lr_scheduler demo
+ - refactor: new BaseEnv and DingEnvWrapper (#171) (#231) (#240)
+ - polish: MAPPO and MASAC smac config (#209) (#239)
+ - polish: QMIX smac config (#175)
+ - polish: R2D2 atari config (#181)
+ - polish: A2C atari config (#189)
+ - polish: GAIL box2d and mujoco config (#188)
+ - polish: ACER atari config (#180)
+ - polish: SQIL atari config (#230)
+ - polish: TREX atari/mujoco config
+ - polish: IMPALA atari config
+ - polish: MBPO/D4PG mujoco config
+ - fix: random_collect compatible to episode collector (#190)
+ - fix: remove default n_sample/n_episode value in policy config (#185)
+ - fix: PDQN model bug on gpu device (#220)
+ - fix: TREX algorithm CLI bug (#182)
+ - fix: DQfD JE computation bug and move to AdamW optimizer (#191)
+ - fix: pytest problem for parallel middleware (#211)
+ - fix: mujoco numpy compatibility bug
+ - fix: markupsafe 2.1.0 bug
+ - fix: framework parallel module network emit bug
+ - fix: mpire bug and disable algotest in py3.8
+ - fix: lunarlander env import and env_id bug
+ - fix: icm unittest repeat name bug
+ - fix: buffer thruput close bug
+ - test: resnet unittest (#199)
+ - test: SAC/SQN unittest (#207)
+ - test: CQL/R2D3/GAIL unittest (#201)
+ - test: NGU td unittest (#210)
+ - test: model wrapper unittest (#215)
+ - test: MAQAC model unittest (#226)
+ - style: add doc docker (#221)
+
+ 2022.01.01(v0.2.3)
+ - env: add multi-agent mujoco env (#146)
+ - env: add delay reward mujoco env (#145)
+ - env: fix port conflict in gym_soccer (#139)
+ - algo: MASAC algorithm (#112)
+ - algo: TREX algorithm (#119) (#144)
+ - algo: H-PPO hybrid action space algorithm (#140)
+ - algo: residual link in R2D2 (#150)
+ - algo: gumbel softmax (#169)
+ - algo: move actor_head_type to action_space field
+ - feature: new main pipeline and async/parallel framework (#142) (#166) (#168)
+ - feature: refactor buffer, separate algorithm and storage (#129)
+ - feature: cli in new pipeline (ditask) (#160)
+ - feature: add multiprocess tblogger, fix circular reference problem (#156)
+ - feature: add multiple seed cli
+ - feature: polish eps_greedy_multinomial_sample in model_wrapper (#154)
+ - fix: R2D3 abs priority problem (#158) (#161)
+ - fix: multi-discrete action space policies random action bug (#167)
+ - fix: doc generate bug with enum_tools (#155)
+ - style: more comments about R2D2 (#149)
+ - style: add doc about how to migrate a new env
+ - style: add doc about env tutorial in dizoo
+ - style: add conda auto release (#148)
+ - style: update zh doc link
+ - style: update kaggle tutorial link
+
+ 2021.12.03(v0.2.2)
+ - env: apple key to door treasure env (#128)
+ - env: add bsuite memory benchmark (#138)
+ - env: polish atari impala config
+ - algo: Guided Cost IRL algorithm (#57)
+ - algo: ICM exploration algorithm (#41)
+ - algo: MP-DQN hybrid action space algorithm (#131)
+ - algo: add loss statistics and polish r2d3 pong config (#126)
+ - feature: add renew env mechanism in env manager and update timeout mechanism (#127) (#134)
+ - fix: async subprocess env manager reset bug (#137)
+ - fix: keepdims name bug in model wrapper
+ - fix: on-policy ppo value norm bug
+ - fix: GAE and RND unittest bug
+ - fix: hidden state wrapper h tensor compatibility
+ - fix: naive buffer auto config create bug
+ - style: add supporters list
+
+ 2021.11.22(v0.2.1)
+ - env: gym-hybrid env (#86)
+ - env: gym-soccer (HFO) env (#94)
+ - env: Go-Bigger env baseline (#95)
+ - env: add the bipedalwalker config of sac and ppo (#121)
+ - algo: DQfD Imitation Learning algorithm (#48) (#98)
+ - algo: TD3BC offline RL algorithm (#88)
+ - algo: MBPO model-based RL algorithm (#113)
+ - algo: PADDPG hybrid action space algorithm (#109)
+ - algo: PDQN hybrid action space algorithm (#118)
+ - algo: fix R2D2 bugs and produce benchmark, add naive NGU (#40)
+ - algo: self-play training demo in slime_volley env (#23)
+ - algo: add example of GAIL entry + config for mujoco (#114)
+ - feature: enable arbitrary policy num in serial sample collector
+ - feature: add torch DataParallel for single machine multi-GPU
+ - feature: add registry force_overwrite argument
+ - feature: add naive buffer periodic thruput seconds argument
+ - test: add pure docker setting test (#103)
+ - test: add unittest for dataset and evaluator (#107)
+ - test: add unittest for on-policy algorithm (#92)
+ - test: add unittest for ppo and td (MARL case) (#89)
+ - test: polish collector benchmark test
+ - fix: target model wrapper hard reset bug
+ - fix: learn state_dict target model bug
+ - fix: ppo bugs and update atari ppo offpolicy config (#108)
+ - fix: pyyaml version bug (#99)
+ - fix: small fix on bsuite environment (#117)
+ - fix: discrete cql unittest bug
+ - fix: release workflow bug
+ - fix: base policy model state_dict overlap bug
+ - fix: remove on_policy option in dizoo config and entry
+ - fix: remove torch in env
+ - style: gym version > 0.20.0
+ - style: torch version >= 1.1.0, <= 1.10.0
+ - style: ale-py == 0.7.0
+
+ 2021.9.30(v0.2.0)
+ - env: overcooked env (#20)
+ - env: procgen env (#26)
+ - env: modified predator env (#30)
+ - env: d4rl env (#37)
+ - env: imagenet dataset (#27)
+ - env: bsuite env (#58)
+ - env: move atari_py to ale-py
+ - algo: SQIL algorithm (#25) (#44)
+ - algo: CQL algorithm (discrete/continuous) (#37) (#68)
442
+ - algo: MAPPO algorithm (#62)
443
+ - algo: WQMIX algorithm (#24)
444
+ - algo: D4PG algorithm (#76)
445
+ - algo: update multi discrete policy(dqn, ppo, rainbow) (#51) (#72)
446
+ - feature: image classification training pipeline (#27)
447
+ - feature: add force_reproducibility option in subprocess env manager
448
+ - feature: add/delete/restart replicas via cli for k8s
449
+ - feautre: add league metric (trueskill and elo) (#22)
450
+ - feature: add tb in naive buffer and modify tb in advanced buffer (#39)
451
+ - feature: add k8s launcher and di-orchestrator launcher, add related unittest (#45) (#49)
452
+ - feature: add hyper-parameter scheduler module (#38)
453
+ - feautre: add plot function (#59)
454
+ - fix: acer bug and update atari result (#21)
455
+ - fix: mappo nan bug and dict obs cannot unsqueeze bug (#54)
456
+ - fix: r2d2 hidden state and obs arange bug (#36) (#52)
457
+ - fix: ppo bug when use dual_clip and adv > 0
458
+ - fix: qmix double_q hidden state bug
459
+ - fix: spawn context problem in interaction unittest (#69)
460
+ - fix: formatted config no eval bug (#53)
461
+ - fix: the catch statments that will never succeed and system proxy bug (#71) (#79)
462
+ - fix: lunarlander config
463
+ - fix: c51 head dimension mismatch bug
464
+ - fix: mujoco config typo bug
465
+ - fix: ppg atari config bug
466
+ - fix: max use and priority update special branch bug in advanced_buffer
467
+ - style: add docker deploy in github workflow (#70) (#78) (#80)
468
+ - style: support PyTorch 1.9.0
469
+ - style: add algo/env list in README
470
+ - style: rename advanced_buffer register name to advanced
471
+
472
+
473
+ 2021.8.3(v0.1.1)
474
+ - env: selfplay/league demo (#12)
475
+ - env: pybullet env (#16)
476
+ - env: minigrid env (#13)
477
+ - env: atari enduro config (#11)
478
+ - algo: on policy PPO (#9)
479
+ - algo: ACER algorithm (#14)
480
+ - feature: polish experiment directory structure (#10)
481
+ - refactor: split doc to new repo (#4)
482
+ - fix: atari env info action space bug
483
+ - fix: env manager retry wrapper raise exception info bug
484
+ - fix: dist entry disable-flask-log typo
485
+ - style: codestyle optimization by lgtm (#7)
486
+ - style: code/comment statistics badge
487
+ - style: github CI workflow
488
+
489
+ 2021.7.8(v0.1.0)
DI-engine/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,128 @@
+ # Contributor Covenant Code of Conduct
+ 
+ ## Our Pledge
+ 
+ We as members, contributors, and leaders pledge to make participation in our
+ community a harassment-free experience for everyone, regardless of age, body
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
+ identity and expression, level of experience, education, socio-economic status,
+ nationality, personal appearance, race, religion, or sexual identity
+ and orientation.
+ 
+ We pledge to act and interact in ways that contribute to an open, welcoming,
+ diverse, inclusive, and healthy community.
+ 
+ ## Our Standards
+ 
+ Examples of behavior that contributes to a positive environment for our
+ community include:
+ 
+ * Demonstrating empathy and kindness toward other people
+ * Being respectful of differing opinions, viewpoints, and experiences
+ * Giving and gracefully accepting constructive feedback
+ * Accepting responsibility and apologizing to those affected by our mistakes,
+   and learning from the experience
+ * Focusing on what is best not just for us as individuals, but for the
+   overall community
+ 
+ Examples of unacceptable behavior include:
+ 
+ * The use of sexualized language or imagery, and sexual attention or
+   advances of any kind
+ * Trolling, insulting or derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or email
+   address, without their explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+ 
+ ## Enforcement Responsibilities
+ 
+ Community leaders are responsible for clarifying and enforcing our standards of
+ acceptable behavior and will take appropriate and fair corrective action in
+ response to any behavior that they deem inappropriate, threatening, offensive,
+ or harmful.
+ 
+ Community leaders have the right and responsibility to remove, edit, or reject
+ comments, commits, code, wiki edits, issues, and other contributions that are
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
+ decisions when appropriate.
+ 
+ ## Scope
+ 
+ This Code of Conduct applies within all community spaces, and also applies when
+ an individual is officially representing the community in public spaces.
+ Examples of representing our community include using an official e-mail address,
+ posting via an official social media account, or acting as an appointed
+ representative at an online or offline event.
+ 
+ ## Enforcement
+ 
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported to the community leaders responsible for enforcement at
+ opendilab.contact@gmail.com.
+ All complaints will be reviewed and investigated promptly and fairly.
+ 
+ All community leaders are obligated to respect the privacy and security of the
+ reporter of any incident.
+ 
+ ## Enforcement Guidelines
+ 
+ Community leaders will follow these Community Impact Guidelines in determining
+ the consequences for any action they deem in violation of this Code of Conduct:
+ 
+ ### 1. Correction
+ 
+ **Community Impact**: Use of inappropriate language or other behavior deemed
+ unprofessional or unwelcome in the community.
+ 
+ **Consequence**: A private, written warning from community leaders, providing
+ clarity around the nature of the violation and an explanation of why the
+ behavior was inappropriate. A public apology may be requested.
+ 
+ ### 2. Warning
+ 
+ **Community Impact**: A violation through a single incident or series
+ of actions.
+ 
+ **Consequence**: A warning with consequences for continued behavior. No
+ interaction with the people involved, including unsolicited interaction with
+ those enforcing the Code of Conduct, for a specified period of time. This
+ includes avoiding interactions in community spaces as well as external channels
+ like social media. Violating these terms may lead to a temporary or
+ permanent ban.
+ 
+ ### 3. Temporary Ban
+ 
+ **Community Impact**: A serious violation of community standards, including
+ sustained inappropriate behavior.
+ 
+ **Consequence**: A temporary ban from any sort of interaction or public
+ communication with the community for a specified period of time. No public or
+ private interaction with the people involved, including unsolicited interaction
+ with those enforcing the Code of Conduct, is allowed during this period.
+ Violating these terms may lead to a permanent ban.
+ 
+ ### 4. Permanent Ban
+ 
+ **Community Impact**: Demonstrating a pattern of violation of community
+ standards, including sustained inappropriate behavior, harassment of an
+ individual, or aggression toward or disparagement of classes of individuals.
+ 
+ **Consequence**: A permanent ban from any sort of public interaction within
+ the community.
+ 
+ ## Attribution
+ 
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+ version 2.0, available at
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+ 
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
+ enforcement ladder](https://github.com/mozilla/diversity).
+ 
+ [homepage]: https://www.contributor-covenant.org
+ 
+ For answers to common questions about this code of conduct, see the FAQ at
+ https://www.contributor-covenant.org/faq. Translations are available at
+ https://www.contributor-covenant.org/translations.
DI-engine/CONTRIBUTING.md ADDED
@@ -0,0 +1,7 @@
+ [Git Guide](https://di-engine-docs.readthedocs.io/en/latest/24_cooperation/git_guide.html)
+ 
+ [GitHub Cooperation Guide](https://di-engine-docs.readthedocs.io/en/latest/24_cooperation/issue_pr.html)
+ 
+ - [Code Style](https://di-engine-docs.readthedocs.io/en/latest/21_code_style/index.html)
+ - [Unit Test](https://di-engine-docs.readthedocs.io/en/latest/22_test/index.html)
+ - [Code Review](https://di-engine-docs.readthedocs.io/en/latest/24_cooperation/issue_pr.html#pr-s-code-review)
DI-engine/LICENSE ADDED
@@ -0,0 +1,202 @@
+ 
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+ 
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ 
+ 1. Definitions.
+ 
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+ 
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+ 
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+ 
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+ 
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+ 
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+ 
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+ 
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+ 
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+ 
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+ 
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+ 
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+ 
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+ 
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+ 
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+ 
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+ 
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+ 
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+ 
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+ 
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+ 
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+ 
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+ 
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+ 
+ END OF TERMS AND CONDITIONS
+ 
+ APPENDIX: How to apply the Apache License to your work.
+ 
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+ 
+ Copyright 2017 Google Inc.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
DI-engine/Makefile ADDED
@@ -0,0 +1,71 @@
+ CI ?=
+ 
+ # Directory variables
+ DING_DIR ?= ./ding
+ DIZOO_DIR ?= ./dizoo
+ RANGE_DIR ?=
+ TEST_DIR ?= $(if ${RANGE_DIR},${RANGE_DIR},${DING_DIR})
+ COV_DIR ?= $(if ${RANGE_DIR},${RANGE_DIR},${DING_DIR})
+ FORMAT_DIR ?= $(if ${RANGE_DIR},${RANGE_DIR},${DING_DIR})
+ PLATFORM_TEST_DIR ?= $(if ${RANGE_DIR},${RANGE_DIR},${DING_DIR}/entry/tests/test_serial_entry.py ${DING_DIR}/entry/tests/test_serial_entry_onpolicy.py)
+ 
+ # Workers command
+ WORKERS ?= 2
+ WORKERS_COMMAND := $(if ${WORKERS},-n ${WORKERS} --dist=loadscope,)
+ 
+ # Duration command
+ DURATIONS ?= 10
+ DURATIONS_COMMAND := $(if ${DURATIONS},--durations=${DURATIONS},)
+ 
+ docs:
+ 	$(MAKE) -C ${DING_DIR}/docs html
+ 
+ unittest:
+ 	pytest ${TEST_DIR} \
+ 		--cov-report=xml \
+ 		--cov-report term-missing \
+ 		--cov=${COV_DIR} \
+ 		${DURATIONS_COMMAND} \
+ 		${WORKERS_COMMAND} \
+ 		-sv -m unittest
+ 
+ algotest:
+ 	pytest ${TEST_DIR} \
+ 		${DURATIONS_COMMAND} \
+ 		-sv -m algotest
+ 
+ cudatest:
+ 	pytest ${TEST_DIR} \
+ 		-sv -m cudatest
+ 
+ envpooltest:
+ 	pytest ${TEST_DIR} \
+ 		-sv -m envpooltest
+ 
+ dockertest:
+ 	${DING_DIR}/scripts/docker-test-entry.sh
+ 
+ platformtest:
+ 	pytest ${TEST_DIR} \
+ 		--cov-report term-missing \
+ 		--cov=${COV_DIR} \
+ 		${WORKERS_COMMAND} \
+ 		-sv -m platformtest
+ 
+ benchmark:
+ 	pytest ${TEST_DIR} \
+ 		--durations=0 \
+ 		-sv -m benchmark
+ 
+ test: unittest # just for compatibility, can be changed later
+ 
+ cpu_test: unittest algotest benchmark
+ 
+ all_test: unittest algotest cudatest benchmark
+ 
+ format:
+ 	yapf --in-place --recursive -p --verbose --style .style.yapf ${FORMAT_DIR}
+ format_test:
+ 	bash format.sh ${FORMAT_DIR} --test
+ flake_check:
+ 	flake8 ${FORMAT_DIR}
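Since `TEST_DIR`, `COV_DIR` and `FORMAT_DIR` above all fall back to `RANGE_DIR` when it is set, a single variable scopes testing, coverage and formatting to one subtree, and `WORKERS` controls the pytest-xdist worker count. A minimal usage sketch (the subtree path and worker count are illustrative, not prescribed by the Makefile):

```bash
# Run unit tests for one subtree with 4 pytest-xdist workers,
# collecting coverage for that same subtree.
make unittest RANGE_DIR=./ding/utils WORKERS=4

# Re-format the same subtree in place with yapf.
make format RANGE_DIR=./ding/utils
```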
DI-engine/README.md ADDED
@@ -0,0 +1,475 @@
+ <div align="center">
+ <a href="https://di-engine-docs.readthedocs.io/en/latest/"><img width="1000px" height="auto" src="https://github.com/opendilab/DI-engine-docs/blob/main/source/images/head_image.png"></a>
+ </div>
+ 
+ ---
+ 
+ [![Twitter](https://img.shields.io/twitter/url?style=social&url=https%3A%2F%2Ftwitter.com%2Fopendilab)](https://twitter.com/opendilab)
+ [![PyPI](https://img.shields.io/pypi/v/DI-engine)](https://pypi.org/project/DI-engine/)
+ ![Conda](https://anaconda.org/opendilab/di-engine/badges/version.svg)
+ ![Conda update](https://anaconda.org/opendilab/di-engine/badges/latest_release_date.svg)
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/DI-engine)
+ ![PyTorch Version](https://img.shields.io/badge/dynamic/json?color=blue&label=pytorch&query=%24.pytorchVersion&url=https%3A%2F%2Fgist.githubusercontent.com/PaParaZz1/54c5c44eeb94734e276b2ed5770eba8d/raw/85b94a54933a9369f8843cc2cea3546152a75661/badges.json)
+ 
+ ![Loc](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/HansBug/3690cccd811e4c5f771075c2f785c7bb/raw/loc.json)
+ ![Comments](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/HansBug/3690cccd811e4c5f771075c2f785c7bb/raw/comments.json)
+ 
+ ![Style](https://github.com/opendilab/DI-engine/actions/workflows/style.yml/badge.svg)
+ [![Read en Docs](https://github.com/opendilab/DI-engine/actions/workflows/doc.yml/badge.svg)](https://di-engine-docs.readthedocs.io/en/latest)
+ [![Read zh_CN Docs](https://img.shields.io/readthedocs/di-engine-docs?label=%E4%B8%AD%E6%96%87%E6%96%87%E6%A1%A3)](https://di-engine-docs.readthedocs.io/zh_CN/latest)
+ ![Unittest](https://github.com/opendilab/DI-engine/actions/workflows/unit_test.yml/badge.svg)
+ ![Algotest](https://github.com/opendilab/DI-engine/actions/workflows/algo_test.yml/badge.svg)
+ ![deploy](https://github.com/opendilab/DI-engine/actions/workflows/deploy.yml/badge.svg)
+ [![codecov](https://codecov.io/gh/opendilab/DI-engine/branch/main/graph/badge.svg?token=B0Q15JI301)](https://codecov.io/gh/opendilab/DI-engine)
+ 
+ ![GitHub Org's stars](https://img.shields.io/github/stars/opendilab)
+ [![GitHub stars](https://img.shields.io/github/stars/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/stargazers)
+ [![GitHub forks](https://img.shields.io/github/forks/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/network)
+ ![GitHub commit activity](https://img.shields.io/github/commit-activity/m/opendilab/DI-engine)
+ [![GitHub issues](https://img.shields.io/github/issues/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/issues)
+ [![GitHub pulls](https://img.shields.io/github/issues-pr/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/pulls)
+ [![Contributors](https://img.shields.io/github/contributors/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/graphs/contributors)
+ [![GitHub license](https://img.shields.io/github/license/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/blob/master/LICENSE)
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow)](https://huggingface.co/OpenDILabCommunity)
+ [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models?search=opendilab)
+ 
+ Updated on 2023.12.05 DI-engine-v0.5.0
+ 
+ 
+ ## Introduction to DI-engine
+ [Documentation](https://di-engine-docs.readthedocs.io/en/latest/) | [Documentation (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/) | [Tutorials](https://di-engine-docs.readthedocs.io/en/latest/01_quickstart/index.html) | [Feature](#feature) | [Task & Middleware](https://di-engine-docs.readthedocs.io/en/latest/03_system/index.html) | [TreeTensor](#general-data-container-treetensor) | [Roadmap](https://github.com/opendilab/DI-engine/issues/548)
+ 
+ **DI-engine** is a generalized decision intelligence engine for PyTorch and JAX.
+ 
+ It provides **python-first** and **asynchronous-native** task and middleware abstractions, and modularly integrates several of the most important decision-making concepts: Env, Policy and Model. Based on these mechanisms, DI-engine supports **various [deep reinforcement learning](https://di-engine-docs.readthedocs.io/en/latest/10_concepts/index.html) algorithms** with superior performance, high efficiency, well-organized [documentation](https://di-engine-docs.readthedocs.io/en/latest/) and [unit tests](https://github.com/opendilab/DI-engine/actions):
+ 
+ - Most basic DRL algorithms: such as DQN, Rainbow, PPO, TD3, SAC, R2D2, IMPALA
+ - Multi-agent RL algorithms: such as QMIX, WQMIX, MAPPO, HAPPO, ACE
+ - Imitation learning algorithms (BC/IRL/GAIL): such as GAIL, SQIL, Guided Cost Learning, Implicit BC
+ - Offline RL algorithms: BCQ, CQL, TD3BC, Decision Transformer, EDAC, Diffuser, Decision Diffuser, SO2
+ - Model-based RL algorithms: SVG, STEVE, MBPO, DDPPO, DreamerV3, MuZero
+ - Exploration algorithms: HER, RND, ICM, NGU
+ - LLM + RL algorithms: PPO-max, DPO, MPDPO
+ - Other algorithms: such as PER, PLR, PCGrad
+ 
+ **DI-engine** aims to **standardize different Decision Intelligence environments and applications**, supporting both academic research and prototype applications. Various training pipelines and customized decision AI applications are also supported:
+ 
+ <details open>
+ <summary>(Click to Collapse)</summary>
+ 
+ - Traditional academic environments
+   - [DI-zoo](https://github.com/opendilab/DI-engine#environment-versatility): various decision intelligence demonstrations and benchmark environments with DI-engine.
+ - Tutorial courses
+   - [PPOxFamily](https://github.com/opendilab/PPOxFamily): PPO x Family DRL Tutorial Course
+ - Real world decision AI applications
+   - [DI-star](https://github.com/opendilab/DI-star): Decision AI in StarCraftII
+   - [DI-drive](https://github.com/opendilab/DI-drive): Auto-driving platform
+   - [DI-sheep](https://github.com/opendilab/DI-sheep): Decision AI in 3 Tiles Game
+   - [DI-smartcross](https://github.com/opendilab/DI-smartcross): Decision AI in Traffic Light Control
+   - [DI-bioseq](https://github.com/opendilab/DI-bioseq): Decision AI in Biological Sequence Prediction and Searching
+   - [DI-1024](https://github.com/opendilab/DI-1024): Deep Reinforcement Learning + 1024 Game
+ - Research paper
+   - [InterFuser](https://github.com/opendilab/InterFuser): [CoRL 2022] Safety-Enhanced Autonomous Driving Using Interpretable Sensor Fusion Transformer
+   - [ACE](https://github.com/opendilab/ACE): [AAAI 2023] ACE: Cooperative Multi-agent Q-learning with Bidirectional Action-Dependency
+   - [GoBigger](https://github.com/opendilab/GoBigger): [ICLR 2023] Multi-Agent Decision Intelligence Environment
+   - [DOS](https://github.com/opendilab/DOS): [CVPR 2023] ReasonNet: End-to-End Driving with Temporal and Global Reasoning
+   - [LightZero](https://github.com/opendilab/LightZero): [NeurIPS 2023 Spotlight] A lightweight and efficient MCTS/AlphaZero/MuZero algorithm toolkit
+   - [SO2](https://github.com/opendilab/SO2): [AAAI 2024] A Perspective of Q-value Estimation on Offline-to-Online Reinforcement Learning
+   - [LMDrive](https://github.com/opendilab/LMDrive): LMDrive: Closed-Loop End-to-End Driving with Large Language Models
+ - Docs and Tutorials
+   - [DI-engine-docs](https://github.com/opendilab/DI-engine-docs): Tutorials, best practices and the API reference.
+   - [awesome-model-based-RL](https://github.com/opendilab/awesome-model-based-RL): A curated list of awesome Model-Based RL resources
+   - [awesome-exploration-RL](https://github.com/opendilab/awesome-exploration-rl): A curated list of awesome exploration RL resources
+   - [awesome-decision-transformer](https://github.com/opendilab/awesome-decision-transformer): A curated list of Decision Transformer resources
+   - [awesome-RLHF](https://github.com/opendilab/awesome-RLHF): A curated list of reinforcement learning with human feedback resources
+   - [awesome-multi-modal-reinforcement-learning](https://github.com/opendilab/awesome-multi-modal-reinforcement-learning): A curated list of Multi-Modal Reinforcement Learning resources
+   - [awesome-AI-based-protein-design](https://github.com/opendilab/awesome-AI-based-protein-design): a collection of research papers for AI-based protein design
+   - [awesome-diffusion-model-in-rl](https://github.com/opendilab/awesome-diffusion-model-in-rl): A curated list of Diffusion Model in RL resources
+   - [awesome-end-to-end-autonomous-driving](https://github.com/opendilab/awesome-end-to-end-autonomous-driving): A curated list of awesome End-to-End Autonomous Driving resources
+   - [awesome-driving-behavior-prediction](https://github.com/opendilab/awesome-driving-behavior-prediction): A collection of research papers for Driving Behavior Prediction
+ </details>
+ 
+ On the low-level end, DI-engine comes with a set of highly reusable modules, including [RL optimization functions](https://github.com/opendilab/DI-engine/tree/main/ding/rl_utils), [PyTorch utilities](https://github.com/opendilab/DI-engine/tree/main/ding/torch_utils) and [auxiliary tools](https://github.com/opendilab/DI-engine/tree/main/ding/utils).
+ 
+ Besides, **DI-engine** also has some special **system optimization and design** for efficient and robust large-scale RL training:
+ 
+ <details close>
+ <summary>(Click for Details)</summary>
+ 
+ - [treevalue](https://github.com/opendilab/treevalue): Tree-nested data structure
+ - [DI-treetensor](https://github.com/opendilab/DI-treetensor): Tree-nested PyTorch tensor Lib
+ - [DI-toolkit](https://github.com/opendilab/DI-toolkit): A simple toolkit package for decision intelligence
+ - [DI-orchestrator](https://github.com/opendilab/DI-orchestrator): RL Kubernetes Custom Resource and Operator Lib
+ - [DI-hpc](https://github.com/opendilab/DI-hpc): RL HPC OP Lib
+ - [DI-store](https://github.com/opendilab/DI-store): RL Object Store
+ </details>
+ 
+ Have fun with exploration and exploitation.
+ 
+ ## Outline
+ 
+ - [Introduction to DI-engine](#introduction-to-di-engine)
+ - [Outline](#outline)
+ - [Installation](#installation)
+ - [Quick Start](#quick-start)
+ - [Feature](#feature)
+   - [Algorithm Versatility](#algorithm-versatility)
+   - [Environment Versatility](#environment-versatility)
+   - [General Data Container: TreeTensor](#general-data-container-treetensor)
+ - [Feedback and Contribution](#feedback-and-contribution)
+ - [Supporters](#supporters)
+   - [↳ Stargazers](#-stargazers)
+   - [↳ Forkers](#-forkers)
+ - [Citation](#citation)
+ - [License](#license)
+ 
+ ## Installation
+ 
+ You can simply install DI-engine from PyPI with the following command:
+ ```bash
+ pip install DI-engine
+ ```
+ 
+ If you use Anaconda or Miniconda, you can install DI-engine from the `opendilab` conda channel with the following command:
+ ```bash
+ conda install -c opendilab di-engine
+ ```
+ 
+ For more information about installation, you can refer to [installation](https://di-engine-docs.readthedocs.io/en/latest/01_quickstart/installation.html).
+ 
+ Our DockerHub repo can be found [here](https://hub.docker.com/repository/docker/opendilab/ding); we provide a `base` image and `env` images with common RL environments.
+ 
+ <details close>
+ <summary>(Click for Details)</summary>
+ 
+ - base: opendilab/ding:nightly
+ - rpc: opendilab/ding:nightly-rpc
+ - atari: opendilab/ding:nightly-atari
+ - mujoco: opendilab/ding:nightly-mujoco
+ - dmc: opendilab/ding:nightly-dmc2gym
+ - metaworld: opendilab/ding:nightly-metaworld
+ - smac: opendilab/ding:nightly-smac
+ - grf: opendilab/ding:nightly-grf
+ - cityflow: opendilab/ding:nightly-cityflow
+ - evogym: opendilab/ding:nightly-evogym
+ - d4rl: opendilab/ding:nightly-d4rl
+ </details>
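Any of these images can be pulled and used as a ready-made training environment; a typical session with the standard Docker CLI (image tag taken from the list above) looks like:

```bash
# Pull the base image and open an interactive shell with DI-engine preinstalled
docker pull opendilab/ding:nightly
docker run -it --rm opendilab/ding:nightly /bin/bash
```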
+ 
+ The detailed documentation is hosted at [doc](https://di-engine-docs.readthedocs.io/en/latest/) | [doc (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/).
+ 
+ ## Quick Start
+ 
+ [3 Minutes Kickoff](https://di-engine-docs.readthedocs.io/en/latest/01_quickstart/first_rl_program.html)
+ 
+ [3 Minutes Kickoff (colab)](https://colab.research.google.com/drive/1_7L-QFDfeCvMvLJzRyBRUW5_Q6ESXcZ4)
+ 
+ [DI-engine Huggingface Kickoff (colab)](https://colab.research.google.com/drive/1UH1GQOjcHrmNSaW77hnLGxFJrLSLwCOk)
+ 
+ [How to migrate a new **RL Env**](https://di-engine-docs.readthedocs.io/en/latest/11_dizoo/index.html) | [How to migrate a new **RL Env** (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/11_dizoo/index_zh.html)
+ 
+ [How to customize the neural network model](https://di-engine-docs.readthedocs.io/en/latest/04_best_practice/custom_model.html) | [How to customize the **neural network model** used by a policy (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/04_best_practice/custom_model_zh.html)
+ 
+ [Example of evaluating/deploying a trained **RL policy**](https://github.com/opendilab/DI-engine/blob/main/dizoo/classic_control/cartpole/entry/cartpole_c51_deploy.py)
+ 
+ [Differences between the old and new pipelines (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/04_best_practice/diff_in_new_pipeline_zh.html)
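As a taste of the happy path behind these tutorials, the sketch below uses the high-level agent API from `ding.bonus` (see `ding/bonus/dqn.py` in this repo); the environment id, experiment name and step budget are illustrative only:

```python
from ding.bonus import DQNAgent

# Minimal sketch, assuming the ding.bonus happy-path API:
# train a DQN agent, then evaluate it and save a video replay.
agent = DQNAgent(env_id="LunarLander-v2", exp_name="lunarlander_dqn")
agent.train(step=400000)
agent.deploy(enable_save_replay=True)
```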
+ 
+ 
+ ## Feature
+ ### Algorithm Versatility
+ 
+ <details open>
+ <summary>(Click to Collapse)</summary>
+ 
+ ![discrete](https://img.shields.io/badge/-discrete-brightgreen) &nbsp;means discrete action space; the action-space labels are the only labels attached to the basic DRL algorithms (No. 1-23)
+ 
+ ![continuous](https://img.shields.io/badge/-continous-green) &nbsp;means continuous action space (No. 1-23)
+ 
+ ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) &nbsp;means hybrid (discrete + continuous) action space (No. 1-23)
+ 
+ ![dist](https://img.shields.io/badge/-distributed-blue) &nbsp;[Distributed Reinforcement Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/distributed_rl.html) | [Distributed Reinforcement Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/distributed_rl_zh.html)
+ 
+ ![MARL](https://img.shields.io/badge/-MARL-yellow) &nbsp;[Multi-Agent Reinforcement Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/multi_agent_cooperation_rl.html) | [Multi-Agent Reinforcement Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/multi_agent_cooperation_rl_zh.html)
+ 
+ ![exp](https://img.shields.io/badge/-exploration-orange) &nbsp;[Exploration Mechanisms in Reinforcement Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/exploration_rl.html) | [Exploration Mechanisms in Reinforcement Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/exploration_rl_zh.html)
+ 
+ ![IL](https://img.shields.io/badge/-IL-purple) &nbsp;[Imitation Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/imitation_learning.html) | [Imitation Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/imitation_learning_zh.html)
+ 
+ ![offline](https://img.shields.io/badge/-offlineRL-darkblue) &nbsp;[Offline Reinforcement Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/offline_rl.html) | [Offline Reinforcement Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/offline_rl_zh.html)
+ 
+ ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) &nbsp;[Model-Based Reinforcement Learning](https://di-engine-docs.readthedocs.io/en/latest/02_algo/model_based_rl.html) | [Model-Based Reinforcement Learning (Chinese)](https://di-engine-docs.readthedocs.io/zh_CN/latest/02_algo/model_based_rl_zh.html)
+ 
+ ![other](https://img.shields.io/badge/-other-lightgrey) &nbsp;means algorithms from other sub-directions, usually used as plug-ins in the whole pipeline
+ 
+ P.S.: The `.py` files in the `Runnable Demo` column can be found in `dizoo`
+ 
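The `Runnable Demo` column in the table below uses two interchangeable launch styles: running the demo script directly with `python3`, or invoking the `ding` CLI with a mode (`-m`), a config file (`-c`) and a seed (`-s`). For example, taking the DQN and PPO rows:

```bash
# DQN on CartPole: plain script launch, or the equivalent ding CLI call
python3 -u cartpole_dqn_main.py
ding -m serial -c cartpole_dqn_config.py -s 0

# On-policy PPO uses the serial_onpolicy mode
ding -m serial_onpolicy -c cartpole_ppo_config.py -s 0
```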
210
+ | No. | Algorithm | Label | Doc and Implementation | Runnable Demo |
211
+ | :--: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
212
+ | 1 | [DQN](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [DQN doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html)<br>[DQN中文文档](https://di-engine-docs.readthedocs.io/zh_CN/latest/12_policies/dqn_zh.html)<br>[policy/dqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/dqn.py) | python3 -u cartpole_dqn_main.py / ding -m serial -c cartpole_dqn_config.py -s 0 |
213
+ | 2 | [C51](https://arxiv.org/pdf/1707.06887.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [C51 doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/c51.html)<br>[policy/c51](https://github.com/opendilab/DI-engine/blob/main/ding/policy/c51.py) | ding -m serial -c cartpole_c51_config.py -s 0 |
214
+ | 3 | [QRDQN](https://arxiv.org/pdf/1710.10044.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [QRDQN doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/qrdqn.html)<br>[policy/qrdqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/qrdqn.py) | ding -m serial -c cartpole_qrdqn_config.py -s 0 |
215
+ | 4 | [IQN](https://arxiv.org/pdf/1806.06923.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [IQN doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/iqn.html)<br>[policy/iqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/iqn.py) | ding -m serial -c cartpole_iqn_config.py -s 0 |
216
+ | 5 | [FQF](https://arxiv.org/pdf/1911.02140.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [FQF doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/fqf.html)<br>[policy/fqf](https://github.com/opendilab/DI-engine/blob/main/ding/policy/fqf.py) | ding -m serial -c cartpole_fqf_config.py -s 0 |
217
+ | 6 | [Rainbow](https://arxiv.org/pdf/1710.02298.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [Rainbow doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/rainbow.html)<br>[policy/rainbow](https://github.com/opendilab/DI-engine/blob/main/ding/policy/rainbow.py) | ding -m serial -c cartpole_rainbow_config.py -s 0 |
218
+ | 7 | [SQL](https://arxiv.org/pdf/1702.08165.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![continuous](https://img.shields.io/badge/-continous-green) | [SQL doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/sql.html)<br>[policy/sql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/sql.py) | ding -m serial -c cartpole_sql_config.py -s 0 |
219
+ | 8 | [R2D2](https://openreview.net/forum?id=r1lyTjAqYX) | ![dist](https://img.shields.io/badge/-distributed-blue)![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [R2D2 doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/r2d2.html)<br>[policy/r2d2](https://github.com/opendilab/DI-engine/blob/main/ding/policy/r2d2.py) | ding -m serial -c cartpole_r2d2_config.py -s 0 |
220
+ | 9 | [PG](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [PG doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/a2c.html)<br>[policy/pg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/pg.py) | ding -m serial -c cartpole_pg_config.py -s 0 |
221
+ | 10 | [PromptPG](https://arxiv.org/abs/2209.14610) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [policy/prompt_pg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/prompt_pg.py) | ding -m serial_onpolicy -c tabmwp_pg_config.py -s 0 |
222
+ | 11 | [A2C](https://arxiv.org/pdf/1602.01783.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [A2C doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/a2c.html)<br>[policy/a2c](https://github.com/opendilab/DI-engine/blob/main/ding/policy/a2c.py) | ding -m serial -c cartpole_a2c_config.py -s 0 |
223
+ | 12 | [PPO](https://arxiv.org/abs/1707.06347)/[MAPPO](https://arxiv.org/pdf/2103.01955.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![continuous](https://img.shields.io/badge/-continous-green)![MARL](https://img.shields.io/badge/-MARL-yellow) | [PPO doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html)<br>[policy/ppo](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ppo.py) | python3 -u cartpole_ppo_main.py / ding -m serial_onpolicy -c cartpole_ppo_config.py -s 0 |
224
+ | 13 | [PPG](https://arxiv.org/pdf/2009.04416.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [PPG doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppg.html)<br>[policy/ppg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ppg.py) | python3 -u cartpole_ppg_main.py |
225
+ | 14 | [ACER](https://arxiv.org/pdf/1611.01224.pdf) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![continuous](https://img.shields.io/badge/-continous-green) | [ACER doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/acer.html)<br>[policy/acer](https://github.com/opendilab/DI-engine/blob/main/ding/policy/acer.py) | ding -m serial -c cartpole_acer_config.py -s 0 |
226
+ | 15 | [IMPALA](https://arxiv.org/abs/1802.01561) | ![dist](https://img.shields.io/badge/-distributed-blue)![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [IMPALA doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/impala.html)<br>[policy/impala](https://github.com/opendilab/DI-engine/blob/main/ding/policy/impala.py) | ding -m serial -c cartpole_impala_config.py -s 0 |
227
+ | 16 | [DDPG](https://arxiv.org/pdf/1509.02971.pdf)/[PADDPG](https://arxiv.org/pdf/1511.04143.pdf) | ![continuous](https://img.shields.io/badge/-continous-green)![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [DDPG doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html)<br>[policy/ddpg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ddpg.py) | ding -m serial -c pendulum_ddpg_config.py -s 0 |
228
+ | 17 | [TD3](https://arxiv.org/pdf/1802.09477.pdf) | ![continuous](https://img.shields.io/badge/-continous-green)![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [TD3 doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html)<br>[policy/td3](https://github.com/opendilab/DI-engine/blob/main/ding/policy/td3.py) | python3 -u pendulum_td3_main.py / ding -m serial -c pendulum_td3_config.py -s 0 |
229
+ | 18 | [D4PG](https://arxiv.org/pdf/1804.08617.pdf) | ![continuous](https://img.shields.io/badge/-continous-green) | [D4PG doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/d4pg.html)<br>[policy/d4pg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/d4pg.py) | python3 -u pendulum_d4pg_config.py |
230
+ | 19 | [SAC](https://arxiv.org/abs/1801.01290)/[MASAC] | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![continuous](https://img.shields.io/badge/-continous-green)![MARL](https://img.shields.io/badge/-MARL-yellow) | [SAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html)<br>[policy/sac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/sac.py) | ding -m serial -c pendulum_sac_config.py -s 0 |
231
+ | 20 | [PDQN](https://arxiv.org/pdf/1810.06394.pdf) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [policy/pdqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/pdqn.py) | ding -m serial -c gym_hybrid_pdqn_config.py -s 0 |
232
+ | 21 | [MPDQN](https://arxiv.org/pdf/1905.04388.pdf) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [policy/pdqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/pdqn.py) | ding -m serial -c gym_hybrid_mpdqn_config.py -s 0 |
233
+ | 22 | [HPPO](https://arxiv.org/pdf/1903.01344.pdf) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [policy/ppo](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ppo.py) | ding -m serial_onpolicy -c gym_hybrid_hppo_config.py -s 0 |
234
+ | 23 | [BDQ](https://arxiv.org/pdf/1711.08946.pdf) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | [policy/bdq](https://github.com/opendilab/DI-engine/blob/main/ding/policy/dqn.py) | python3 -u hopper_bdq_config.py |
235
+ | 24 | [MDQN](https://arxiv.org/abs/2007.14430) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | [policy/mdqn](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mdqn.py) | python3 -u asterix_mdqn_config.py |
236
+ | 25 | [QMIX](https://arxiv.org/pdf/1803.11485.pdf) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [QMIX doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/qmix.html)<br>[policy/qmix](https://github.com/opendilab/DI-engine/blob/main/ding/policy/qmix.py) | ding -m serial -c smac_3s5z_qmix_config.py -s 0 |
237
+ | 26 | [COMA](https://arxiv.org/pdf/1705.08926.pdf) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [COMA doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/coma.html)<br>[policy/coma](https://github.com/opendilab/DI-engine/blob/main/ding/policy/coma.py) | ding -m serial -c smac_3s5z_coma_config.py -s 0 |
238
+ | 27 | [QTran](https://arxiv.org/abs/1905.05408) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [policy/qtran](https://github.com/opendilab/DI-engine/blob/main/ding/policy/qtran.py) | ding -m serial -c smac_3s5z_qtran_config.py -s 0 |
239
+ | 28 | [WQMIX](https://arxiv.org/abs/2006.10800) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [WQMIX doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/wqmix.html)<br>[policy/wqmix](https://github.com/opendilab/DI-engine/blob/main/ding/policy/wqmix.py) | ding -m serial -c smac_3s5z_wqmix_config.py -s 0 |
240
+ | 29 | [CollaQ](https://arxiv.org/pdf/2010.08531.pdf) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [CollaQ doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/collaq.html)<br>[policy/collaq](https://github.com/opendilab/DI-engine/blob/main/ding/policy/collaq.py) | ding -m serial -c smac_3s5z_collaq_config.py -s 0 |
241
+ | 30 | [MADDPG](https://arxiv.org/pdf/1706.02275.pdf) | ![MARL](https://img.shields.io/badge/-MARL-yellow) | [MADDPG doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html)<br>[policy/ddpg](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ddpg.py) | ding -m serial -c ant_maddpg_config.py -s 0 |
242
+ | 31 | [GAIL](https://arxiv.org/pdf/1606.03476.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [GAIL doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/gail.html)<br>[reward_model/gail](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/gail_irl_model.py) | ding -m serial_gail -c cartpole_dqn_gail_config.py -s 0 |
243
+ | 32 | [SQIL](https://arxiv.org/pdf/1905.11108.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [SQIL doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/sqil.html)<br>[entry/sqil](https://github.com/opendilab/DI-engine/blob/main/ding/entry/serial_entry_sqil.py) | ding -m serial_sqil -c cartpole_sqil_config.py -s 0 |
244
+ | 33 | [DQFD](https://arxiv.org/pdf/1704.03732.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [DQFD doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqfd.html)<br>[policy/dqfd](https://github.com/opendilab/DI-engine/blob/main/ding/policy/dqfd.py) | ding -m serial_dqfd -c cartpole_dqfd_config.py -s 0 |
245
+ | 34 | [R2D3](https://arxiv.org/pdf/1909.01387.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [R2D3 doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/r2d3.html)<br>[R2D3中文文档](https://di-engine-docs.readthedocs.io/zh_CN/latest/12_policies/r2d3_zh.html)<br>[policy/r2d3](https://di-engine-docs.readthedocs.io/zh_CN/latest/12_policies/r2d3_zh.html) | python3 -u pong_r2d3_r2d2expert_config.py |
246
+ | 35 | [Guided Cost Learning](https://arxiv.org/pdf/1603.00448.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [Guided Cost Learning中文文档](https://di-engine-docs.readthedocs.io/zh_CN/latest/12_policies/guided_cost_zh.html)<br>[reward_model/guided_cost](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/guided_cost_reward_model.py) | python3 lunarlander_gcl_config.py |
247
+ | 36 | [TREX](https://arxiv.org/abs/1904.06387) | ![IL](https://img.shields.io/badge/-IL-purple) | [TREX doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/trex.html)<br>[reward_model/trex](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/trex_reward_model.py) | python3 mujoco_trex_main.py |
248
+ | 37 | [Implicit Behavorial Cloning](https://implicitbc.github.io/) (DFO+MCMC) | ![IL](https://img.shields.io/badge/-IL-purple) | [policy/ibc](https://github.com/opendilab/DI-engine/blob/main/ding/policy/ibc.py) <br> [model/template/ebm](https://github.com/opendilab/DI-engine/blob/main/ding/model/template/ebm.py) | python3 d4rl_ibc_main.py -s 0 -c pen_human_ibc_mcmc_config.py |
249
+ | 38 | [BCO](https://arxiv.org/pdf/1805.01954.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [entry/bco](https://github.com/opendilab/DI-engine/blob/main/ding/entry/serial_entry_bco.py) | python3 -u cartpole_bco_config.py |
250
+ | 39 | [HER](https://arxiv.org/pdf/1707.01495.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [HER doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/her.html)<br>[reward_model/her](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/her_reward_model.py) | python3 -u bitflip_her_dqn.py |
251
+ | 40 | [RND](https://arxiv.org/abs/1810.12894) | ![exp](https://img.shields.io/badge/-exploration-orange) | [RND doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/rnd.html)<br>[reward_model/rnd](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/rnd_reward_model.py) | python3 -u cartpole_rnd_onppo_config.py |
252
+ | 41 | [ICM](https://arxiv.org/pdf/1705.05363.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [ICM doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/icm.html)<br>[ICM中文文档](https://di-engine-docs.readthedocs.io/zh_CN/latest/12_policies/icm_zh.html)<br>[reward_model/icm](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/icm_reward_model.py) | python3 -u cartpole_ppo_icm_config.py |
253
+ | 42 | [CQL](https://arxiv.org/pdf/2006.04779.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [CQL doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/cql.html)<br>[policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/cql.py) | python3 -u d4rl_cql_main.py |
254
+ | 43 | [TD3BC](https://arxiv.org/pdf/2106.06860.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [TD3BC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3_bc.html)<br>[policy/td3_bc](https://github.com/opendilab/DI-engine/blob/main/ding/policy/td3_bc.py) | python3 -u d4rl_td3_bc_main.py |
255
+ | 44 | [Decision Transformer](https://arxiv.org/pdf/2106.01345.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/dt](https://github.com/opendilab/DI-engine/blob/main/ding/policy/dt.py) | python3 -u d4rl_dt_mujoco.py |
256
+ | 45 | [EDAC](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)<br>[policy/edac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py |
257
+ | 46 | MBSAC([SAC](https://arxiv.org/abs/1801.01290)+[MVE](https://arxiv.org/abs/1803.00101)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_mbsac_mbpo_config.py<br>python3 -u pendulum_mbsac_ddppo_config.py |
258
+ | 47 | STEVESAC([SAC](https://arxiv.org/abs/1801.01290)+[STEVE](https://arxiv.org/abs/1807.01675)+[SVG](https://arxiv.org/abs/1510.09142)) | ![continuous](https://img.shields.io/badge/-continous-green)![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [policy/mbpolicy/mbsac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/mbpolicy/mbsac.py) | python3 -u pendulum_stevesac_mbpo_config.py |
259
+ | 48 | [MBPO](https://arxiv.org/pdf/1906.08253.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [MBPO doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/mbpo.html)<br>[world_model/mbpo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/mbpo.py) | python3 -u pendulum_sac_mbpo_config.py |
260
+ | 49 | [DDPPO](https://openreview.net/forum?id=rzvOQrnclO0) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [world_model/ddppo](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/ddppo.py) | python3 -u pendulum_mbsac_ddppo_config.py |
261
+ | 50 | [DreamerV3](https://arxiv.org/pdf/2301.04104.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [world_model/dreamerv3](https://github.com/opendilab/DI-engine/blob/main/ding/world_model/dreamerv3.py) | python3 -u cartpole_balance_dreamer_config.py |
262
+ | 51 | [PER](https://arxiv.org/pdf/1511.05952.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [worker/replay_buffer](https://github.com/opendilab/DI-engine/blob/main/ding/worker/replay_buffer/advanced_buffer.py) | `rainbow demo` |
263
+ | 52 | [GAE](https://arxiv.org/pdf/1506.02438.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [rl_utils/gae](https://github.com/opendilab/DI-engine/blob/main/ding/rl_utils/gae.py) | `ppo demo` |
264
+ | 53 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 |
265
+ | 54 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)<br>[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
266
+ | 55 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
267
+ </details>
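Each table entry maps to a CLI invocation of the form `ding -m <pipeline> -c <config> -s <seed>`. The same serial pipeline can also be launched from Python; below is a minimal sketch, assuming the config module exposes `main_config` and `create_config` the way the bundled `dizoo` configs do (the cartpole config shown is illustrative):

```python
from ding.entry import serial_pipeline
# Illustrative config import; any bundled dizoo config that defines
# main_config/create_config works the same way.
from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config

# Roughly equivalent to: ding -m serial -c cartpole_dqn_config.py -s 0
serial_pipeline([main_config, create_config], seed=0)
```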
268
+
269
+
270
+ ### Environment Versatility
271
+ <details open>
272
+ <summary>(Click to Collapse)</summary>
273
+
274
+ | No | Environment | Label | Visualization | Code and Doc Links |
275
+ | :--: | :--------------------------------------: | :---------------------------------: | :--------------------------------:|:---------------------------------------------------------: |
276
+ | 1 | [Atari](https://github.com/openai/gym/tree/master/gym/envs/atari) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/atari/atari.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/atari/envs) <br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/atari.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/atari_zh.html) |
277
+ | 2 | [box2d/bipedalwalker](https://github.com/openai/gym/tree/master/gym/envs/box2d) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/box2d/bipedalwalker/original.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/box2d/bipedalwalker/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/bipedalwalker.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/bipedalwalker_zh.html) |
278
+ | 3 | [box2d/lunarlander](https://github.com/openai/gym/tree/master/gym/envs/box2d) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/box2d/lunarlander/lunarlander.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/box2d/lunarlander/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/lunarlander_zh.html) |
279
+ | 4 | [classic_control/cartpole](https://github.com/openai/gym/tree/master/gym/envs/classic_control) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/classic_control/cartpole/cartpole.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/classic_control/cartpole/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/cartpole.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/cartpole_zh.html) |
280
+ | 5 | [classic_control/pendulum](https://github.com/openai/gym/tree/master/gym/envs/classic_control) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/classic_control/pendulum/pendulum.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/classic_control/pendulum/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/pendulum.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/pendulum_zh.html) |
281
+ | 6 | [competitive_rl](https://github.com/cuhkrlcourse/competitive-rl) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![selfplay](https://img.shields.io/badge/-selfplay-blue) | ![original](./dizoo/competitive_rl/competitive_rl.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/competitive_rl/envs)<br>[环境指南](https://di-engine-docs.readthedocs.io/en/latest/13_envs/competitive_rl_zh.html) |
282
+ | 7 | [gfootball](https://github.com/google-research/football) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![sparse](https://img.shields.io/badge/-sparse%20reward-orange)![selfplay](https://img.shields.io/badge/-selfplay-blue) | ![original](./dizoo/gfootball/gfootball.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gfootball/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/gfootball.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/en/latest/13_envs/gfootball_zh.html) |
283
+ | 8 | [minigrid](https://github.com/maximecb/gym-minigrid) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![sparse](https://img.shields.io/badge/-sparse%20reward-orange) | ![original](./dizoo/minigrid/minigrid.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/minigrid/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/minigrid.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/en/latest/13_envs/minigrid_zh.html) |
284
+ | 9 | [MuJoCo](https://github.com/openai/gym/tree/master/gym/envs/mujoco) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/mujoco/mujoco.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/mujoco/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco_zh.html) |
285
+ | 10 | [PettingZoo](https://github.com/Farama-Foundation/PettingZoo) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![continuous](https://img.shields.io/badge/-continous-green) ![marl](https://img.shields.io/badge/-MARL-yellow) | ![original](./dizoo/petting_zoo/petting_zoo_mpe_simple_spread.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/petting_zoo/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/pettingzoo.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/pettingzoo_zh.html) |
286
+ | 11 | [overcooked](https://github.com/HumanCompatibleAI/overcooked-demo) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![marl](https://img.shields.io/badge/-MARL-yellow) | ![original](./dizoo/overcooked/overcooked.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/overcooked/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/overcooked.html) |
287
+ | 12 | [procgen](https://github.com/openai/procgen) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/procgen/coinrun.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/procgen)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/procgen.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/procgen_zh.html) |
288
+ | 13 | [pybullet](https://github.com/benelot/pybullet-gym) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/pybullet/pybullet.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/pybullet/envs)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/pybullet_zh.html) |
289
+ | 14 | [smac](https://github.com/oxwhirl/smac) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![marl](https://img.shields.io/badge/-MARL-yellow)![selfplay](https://img.shields.io/badge/-selfplay-blue)![sparse](https://img.shields.io/badge/-sparse%20reward-orange) | ![original](./dizoo/smac/smac.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/smac/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/smac.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/smac_zh.html) |
290
+ | 15 | [d4rl](https://github.com/rail-berkeley/d4rl) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | ![ori](dizoo/d4rl/d4rl.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/d4rl)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/d4rl_zh.html) |
291
+ | 16 | league_demo | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![selfplay](https://img.shields.io/badge/-selfplay-blue) | ![original](./dizoo/league_demo/league_demo.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/league_demo/envs) |
292
+ | 17 | pomdp atari | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/pomdp/envs) |
293
+ | 18 | [bsuite](https://github.com/deepmind/bsuite) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/bsuite/bsuite.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/bsuite/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs//bsuite.html) <br> [环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/bsuite_zh.html) |
294
+ | 19 | [ImageNet](https://www.image-net.org/) | ![IL](https://img.shields.io/badge/-IL/SL-purple) | ![original](./dizoo/image_classification/imagenet.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/image_classification)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/image_cls_zh.html) |
295
+ | 20 | [slime_volleyball](https://github.com/hardmaru/slimevolleygym) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen)![selfplay](https://img.shields.io/badge/-selfplay-blue) | ![ori](dizoo/slime_volley/slime_volley.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/slime_volley)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/slime_volleyball.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/slime_volleyball_zh.html) |
296
+ | 21 | [gym_hybrid](https://github.com/thomashirtz/gym-hybrid) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | ![ori](dizoo/gym_hybrid/moving_v0.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gym_hybrid)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/gym_hybrid.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/gym_hybrid_zh.html) |
297
+ | 22 | [GoBigger](https://github.com/opendilab/GoBigger) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen)![marl](https://img.shields.io/badge/-MARL-yellow)![selfplay](https://img.shields.io/badge/-selfplay-blue) | ![ori](./dizoo/gobigger_overview.gif) | [dizoo link](https://github.com/opendilab/GoBigger-Challenge-2021/tree/main/di_baseline)<br>[env tutorial](https://gobigger.readthedocs.io/en/latest/index.html)<br>[环境指南](https://gobigger.readthedocs.io/zh_CN/latest/) |
298
+ | 23 | [gym_soccer](https://github.com/openai/gym-soccer) | ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) | ![ori](dizoo/gym_soccer/half_offensive.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gym_soccer)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/gym_soccer_zh.html) |
299
+ | 24 |[multiagent_mujoco](https://github.com/schroederdewitt/multiagent_mujoco) | ![continuous](https://img.shields.io/badge/-continous-green) ![marl](https://img.shields.io/badge/-MARL-yellow) | ![original](./dizoo/mujoco/mujoco.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/multiagent_mujoco/envs)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/mujoco_zh.html) |
300
+ | 25 |bitflip | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) ![sparse](https://img.shields.io/badge/-sparse%20reward-orange) | ![original](./dizoo/bitflip/bitflip.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/bitflip/envs)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/bitflip_zh.html) |
301
+ | 26 |[sokoban](https://github.com/mpSchrader/gym-sokoban) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![Game 2](https://github.com/mpSchrader/gym-sokoban/raw/default/docs/Animations/solved_4.gif?raw=true) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/sokoban/envs)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/sokoban.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/sokoban_zh.html) |
302
+ | 27 |[gym_anytrading](https://github.com/AminHP/gym-anytrading) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/gym_anytrading/envs/position.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gym_anytrading) <br> [env tutorial](https://github.com/opendilab/DI-engine/blob/main/dizoo/gym_anytrading/envs/README.md) |
303
+ | 28 |[mario](https://github.com/Kautenja/gym-super-mario-bros) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/mario/mario.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/mario) <br> [env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/gym_super_mario_bros.html) <br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/gym_super_mario_bros_zh.html) |
304
+ | 29 |[dmc2gym](https://github.com/denisyarats/dmc2gym) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/dmc2gym/dmc2gym_cheetah.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/dmc2gym)<br>[env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/dmc2gym.html)<br>[环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/dmc2gym_zh.html) |
305
+ | 30 |[evogym](https://github.com/EvolutionGym/evogym) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/evogym/evogym.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/evogym/envs) <br> [env tutorial](https://di-engine-docs.readthedocs.io/en/latest/13_envs/evogym.html) <br> [环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/Evogym_zh.html) |
306
+ | 31 |[gym-pybullet-drones](https://github.com/utiasDSL/gym-pybullet-drones) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/gym_pybullet_drones/gym_pybullet_drones.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/gym_pybullet_drones/envs)<br>环境指南 |
307
+ | 32 |[beergame](https://github.com/OptMLGroup/DeepBeerInventory-RL) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/beergame/beergame.png) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/beergame/envs)<br>环境指南 |
308
+ | 33 |[classic_control/acrobot](https://github.com/openai/gym/tree/master/gym/envs/classic_control) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/classic_control/acrobot/acrobot.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/classic_control/acrobot/envs)<br> [环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/acrobot_zh.html) |
309
+ | 34 |[box2d/car_racing](https://github.com/openai/gym/blob/master/gym/envs/box2d/car_racing.py) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) <br> ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/box2d/carracing/car_racing.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/box2d/carracing/envs)<br>环境指南 |
310
+ | 35 |[metadrive](https://github.com/metadriverse/metadrive) | ![continuous](https://img.shields.io/badge/-continous-green) | ![original](./dizoo/metadrive/metadrive_env.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/metadrive/env)<br> [环境指南](https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/metadrive_zh.html) |
311
+ | 36 |[cliffwalking](https://github.com/openai/gym/blob/master/gym/envs/toy_text/cliffwalking.py) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/cliffwalking/cliff_walking.gif) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/cliffwalking/envs)<br> env tutorial <br> 环境指南 |
312
+ | 37 | [tabmwp](https://promptpg.github.io/explore.html) | ![discrete](https://img.shields.io/badge/-discrete-brightgreen) | ![original](./dizoo/tabmwp/tabmwp.jpeg) | [dizoo link](https://github.com/opendilab/DI-engine/tree/main/dizoo/tabmwp) <br> env tutorial <br> 环境指南|
313
+
314
+ ![discrete](https://img.shields.io/badge/-discrete-brightgreen) means discrete action space
315
+
316
+ ![continuous](https://img.shields.io/badge/-continous-green) means continuous action space
317
+
318
+ ![hybrid](https://img.shields.io/badge/-hybrid-darkgreen) means hybrid (discrete + continuous) action space
319
+
320
+ ![MARL](https://img.shields.io/badge/-MARL-yellow) means multi-agent RL environment
321
+
322
+ ![sparse](https://img.shields.io/badge/-sparse%20reward-orange) means environment which is related to exploration and sparse reward
323
+
324
+ ![offline](https://img.shields.io/badge/-offlineRL-darkblue) means offline RL environment
325
+
326
+ ![IL](https://img.shields.io/badge/-IL/SL-purple) means Imitation Learning or Supervised Learning Dataset
327
+
328
+ ![selfplay](https://img.shields.io/badge/-selfplay-blue) means environment that allows agent VS agent battle
329
+
330
+ P.S. Some environments in Atari, such as **MontezumaRevenge**, are also of the sparse reward type.
331
+ </details>
332
+
333
+
334
+ ### General Data Container: TreeTensor
335
+
336
+ DI-engine utilizes [TreeTensor](https://github.com/opendilab/DI-treetensor) as the basic data container in various components, which is easy to use and consistent across different code modules, such as environment definition, data processing and DRL optimization. Here are some concrete code examples:
337
+
338
+ - TreeTensor can easily extend all the operations of `torch.Tensor` to nested data:
339
+ <details close>
340
+ <summary>(Click for Details)</summary>
341
+
342
+ ```python
343
+ import treetensor.torch as ttorch
344
+
345
+
346
+ # create random tensor
347
+ data = ttorch.randn({'a': (3, 2), 'b': {'c': (3, )}})
348
+ # clone+detach tensor
349
+ data_clone = data.clone().detach()
350
+ # access tree structure like attribute
351
+ a = data.a
352
+ c = data.b.c
353
+ # stack/cat/split
354
+ stacked_data = ttorch.stack([data, data_clone], 0)
355
+ cat_data = ttorch.cat([data, data_clone], 0)
356
+ data, data_clone = ttorch.split(stacked_data, 1)
357
+ # reshape
358
+ data = data.unsqueeze(-1)
359
+ data = data.squeeze(-1)
360
+ flatten_data = data.view(-1)
361
+ # indexing
362
+ data_0 = data[0]
363
+ data_1to2 = data[1:2]
364
+ # execute math calculations
365
+ data = data.sin()
366
+ data.b.c.cos_().clamp_(-1, 1)
367
+ data += data ** 2
368
+ # backward
369
+ data.requires_grad_(True)
370
+ loss = data.arctan().mean()
371
+ loss.backward()
372
+ # print shape
373
+ print(data.shape)
374
+ # result
375
+ # <Size 0x7fbd3346ddc0>
376
+ # ├── 'a' --> torch.Size([1, 3, 2])
377
+ # └── 'b' --> <Size 0x7fbd3346dd00>
378
+ # └── 'c' --> torch.Size([1, 3])
379
+ ```
380
+
381
+ </details>
382
+
383
+ - TreeTensor makes it simple yet effective to implement a classic deep reinforcement learning pipeline:
384
+ <details close>
385
+ <summary>(Click for Details)</summary>
386
+
387
+ ```diff
388
+ import torch
389
+ import treetensor.torch as ttorch
390
+
391
+ B = 4
392
+
393
+
394
+ def get_item():
395
+ return {
396
+ 'obs': {
397
+ 'scalar': torch.randn(12),
398
+ 'image': torch.randn(3, 32, 32),
399
+ },
400
+ 'action': torch.randint(0, 10, size=(1,)),
401
+ 'reward': torch.rand(1),
402
+ 'done': False,
403
+ }
404
+
405
+
406
+ data = [get_item() for _ in range(B)]
407
+
408
+
409
+ # execute `stack` op
410
+ - def stack(data, dim):
411
+ - elem = data[0]
412
+ - if isinstance(elem, torch.Tensor):
413
+ - return torch.stack(data, dim)
414
+ - elif isinstance(elem, dict):
415
+ - return {k: stack([item[k] for item in data], dim) for k in elem.keys()}
416
+ - elif isinstance(elem, bool):
417
+ - return torch.BoolTensor(data)
418
+ - else:
419
+ - raise TypeError("not support elem type: {}".format(type(elem)))
420
+ - stacked_data = stack(data, dim=0)
421
+ + data = [ttorch.tensor(d) for d in data]
422
+ + stacked_data = ttorch.stack(data, dim=0)
423
+
424
+ # validate
425
+ - assert stacked_data['obs']['image'].shape == (B, 3, 32, 32)
426
+ - assert stacked_data['action'].shape == (B, 1)
427
+ - assert stacked_data['reward'].shape == (B, 1)
428
+ - assert stacked_data['done'].shape == (B,)
429
+ - assert stacked_data['done'].dtype == torch.bool
430
+ + assert stacked_data.obs.image.shape == (B, 3, 32, 32)
431
+ + assert stacked_data.action.shape == (B, 1)
432
+ + assert stacked_data.reward.shape == (B, 1)
433
+ + assert stacked_data.done.shape == (B,)
434
+ + assert stacked_data.done.dtype == torch.bool
435
+ ```
436
+
437
+ </details>
438
+
439
+ ## Feedback and Contribution
440
+
441
+ - [File an issue](https://github.com/opendilab/DI-engine/issues/new/choose) on Github
442
+ - Open or participate in our [forum](https://github.com/opendilab/DI-engine/discussions)
443
+ - Discuss on DI-engine [slack communication channel](https://join.slack.com/t/opendilab/shared_invite/zt-v9tmv4fp-nUBAQEH1_Kuyu_q4plBssQ)
444
+ - Discuss on DI-engine's WeChat group (i.e. add us on WeChat: ding314assist)
445
+
446
+ <img src=https://github.com/opendilab/DI-engine/blob/main/assets/wechat.jpeg width=35% />
447
+ - Contact our email (opendilab@pjlab.org.cn)
448
+ - Contribute to our future plan: [Roadmap](https://github.com/opendilab/DI-engine/issues/548)
449
+
450
+ We appreciate all feedback and contributions that help improve DI-engine, in both algorithms and system design. `CONTRIBUTING.md` offers the necessary information.
451
+
452
+ ## Supporters
453
+
454
+ ### &#8627; Stargazers
455
+
456
+ [![Stargazers repo roster for @opendilab/DI-engine](https://reporoster.com/stars/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/stargazers)
457
+
458
+ ### &#8627; Forkers
459
+
460
+ [![Forkers repo roster for @opendilab/DI-engine](https://reporoster.com/forks/opendilab/DI-engine)](https://github.com/opendilab/DI-engine/network/members)
461
+
462
+
463
+ ## Citation
464
+ ```latex
465
+ @misc{ding,
466
+ title={DI-engine: OpenDILab Decision Intelligence Engine},
467
+ author={OpenDILab Contributors},
468
+ publisher={GitHub},
469
+ howpublished={\url{https://github.com/opendilab/DI-engine}},
470
+ year={2021},
471
+ }
472
+ ```
473
+
474
+ ## License
475
+ DI-engine is released under the Apache 2.0 license.
DI-engine/cloc.sh ADDED
@@ -0,0 +1,69 @@
1
+ #!/bin/bash
2
+
3
+ # This script counts the lines of code and comments in all source files
4
+ # and prints the results to the command line. It uses the commandline tool
5
+ # "cloc". You can either pass --loc, --comments or --percentage to show the
6
+ # respective values only.
7
+ # Some parts below need to be adapted to your project!
8
+
9
+ # Get the location of this script.
10
+ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
11
+
12
+ # Run cloc - this counts code lines, blank lines and comment lines
13
+ # for the specified languages. You will need to change this accordingly.
14
+ # For C++, you could use "C++,C/C++ Header" for example.
15
+ # We are only interested in the summary, therefore the tail -1
16
+ SUMMARY="$(cloc "${SCRIPT_DIR}" --include-lang="Python" --md | tail -1)"
17
+
18
+ # The $SUMMARY is one line of a markdown table and looks like this:
19
+ # SUM:|101|3123|2238|10783
20
+ # We use the following command to split it into an array.
21
+ IFS='|' read -r -a TOKENS <<< "$SUMMARY"
22
+
23
+ # Store the individual tokens for better readability.
24
+ NUMBER_OF_FILES=${TOKENS[1]}
25
+ COMMENT_LINES=${TOKENS[3]}
26
+ LINES_OF_CODE=${TOKENS[4]}
27
+
28
+ # To make the estimate of commented lines more accurate, we have to
29
+ # subtract any copyright header which is included in each file.
30
+ # In Fly-Pie, the project this script originates from, that header was five lines long.
31
+ # All dumb comments like those /////////// or those // ------------
32
+ # are also subtracted. As cloc does not count inline comments,
33
+ # the overall estimate should be rather conservative.
34
+ # Change the lines below according to your project.
35
+ DUMB_COMMENTS="$(grep -r -E '//////|// -----' "${SCRIPT_DIR}" | wc -l)"
36
+ COMMENT_LINES=$(($COMMENT_LINES - 5 * $NUMBER_OF_FILES - $DUMB_COMMENTS))
37
+
38
+ # Print all results if no arguments are given.
39
+ if [[ $# -eq 0 ]] ; then
40
+ awk -v a=$LINES_OF_CODE \
41
+ 'BEGIN {printf "Lines of source code: %6.1fk\n", a/1000}'
42
+ awk -v a=$COMMENT_LINES \
43
+ 'BEGIN {printf "Lines of comments: %6.1fk\n", a/1000}'
44
+ awk -v a=$COMMENT_LINES -v b=$LINES_OF_CODE \
45
+ 'BEGIN {printf "Comment Percentage: %6.1f%\n", 100*a/b}'
46
+ exit 0
47
+ fi
48
+
49
+ # Show lines of code if --loc is given.
50
+ if [[ $* == *--loc* ]]
51
+ then
52
+ awk -v a=$LINES_OF_CODE \
53
+ 'BEGIN {printf "%.1fk\n", a/1000}'
54
+ fi
55
+
56
+ # Show lines of comments if --comments is given.
57
+ if [[ $* == *--comments* ]]
58
+ then
59
+ awk -v a=$COMMENT_LINES \
60
+ 'BEGIN {printf "%.1fk\n", a/1000}'
61
+ fi
62
+
63
+ # Show percentage of comments if --percentage is given.
64
+ if [[ $* == *--percentage* ]]
65
+ then
66
+ awk -v a=$COMMENT_LINES -v b=$LINES_OF_CODE \
67
+ 'BEGIN {printf "%.1f\n", 100*a/b}'
68
+ fi
69
+
DI-engine/codecov.yml ADDED
@@ -0,0 +1,8 @@
1
+ coverage:
2
+ status:
3
+ project:
4
+ default:
5
+ # basic
6
+ target: auto
7
+ threshold: 0.5%
8
+ if_ci_failed: success #success, failure, error, ignore
DI-engine/conda/conda_build_config.yaml ADDED
@@ -0,0 +1,2 @@
1
+ python:
2
+ - 3.7
DI-engine/conda/meta.yaml ADDED
@@ -0,0 +1,35 @@
1
+ {% set data = load_setup_py_data() %}
2
+ package:
3
+ name: di-engine
4
+ version: v0.5.0
5
+
6
+ source:
7
+ path: ..
8
+
9
+ build:
10
+ number: 0
11
+ script: python -m pip install . -vv
12
+ entry_points:
13
+ - ding = ding.entry.cli:cli
14
+
15
+ requirements:
16
+ build:
17
+ - python
18
+ - setuptools
19
+ run:
20
+ - python
21
+
22
+ test:
23
+ imports:
24
+ - ding
25
+ - dizoo
26
+
27
+ about:
28
+ home: https://github.com/opendilab/DI-engine
29
+ license: Apache-2.0
30
+ license_file: LICENSE
31
+ summary: DI-engine is a generalized Decision Intelligence engine (https://github.com/opendilab/DI-engine).
32
+ description: Please refer to https://di-engine-docs.readthedocs.io/en/latest/00_intro/index.html#what-is-di-engine
33
+ dev_url: https://github.com/opendilab/DI-engine
34
+ doc_url: https://di-engine-docs.readthedocs.io/en/latest/index.html
35
+ doc_source_url: https://github.com/opendilab/DI-engine-docs
DI-engine/ding/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ import os
2
+
3
+ __TITLE__ = 'DI-engine'
4
+ __VERSION__ = 'v0.5.0'
5
+ __DESCRIPTION__ = 'Decision AI Engine'
6
+ __AUTHOR__ = "OpenDILab Contributors"
7
+ __AUTHOR_EMAIL__ = "opendilab@pjlab.org.cn"
8
+ __version__ = __VERSION__
9
+
10
+ enable_hpc_rl = os.environ.get('ENABLE_DI_HPC', 'false').lower() == 'true'
11
+ enable_linklink = os.environ.get('ENABLE_LINKLINK', 'false').lower() == 'true'
12
+ enable_numba = True
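These flags are plain import-time toggles: each environment variable is read once, when `ding` is first imported. A minimal sketch of opting in (the variable names come straight from the module above; setting them like this is an assumption of typical usage, not a documented API):

```python
import os

# Must be set before `ding` is imported, since the flags are evaluated at import time.
os.environ['ENABLE_DI_HPC'] = 'true'     # turns on ding.enable_hpc_rl
os.environ['ENABLE_LINKLINK'] = 'true'   # turns on ding.enable_linklink

import ding
assert ding.enable_hpc_rl and ding.enable_linklink
```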
DI-engine/ding/bonus/__init__.py ADDED
@@ -0,0 +1,132 @@
1
+ import ding.config
2
+ from .a2c import A2CAgent
3
+ from .c51 import C51Agent
4
+ from .ddpg import DDPGAgent
5
+ from .dqn import DQNAgent
6
+ from .pg import PGAgent
7
+ from .ppof import PPOF
8
+ from .ppo_offpolicy import PPOOffPolicyAgent
9
+ from .sac import SACAgent
10
+ from .sql import SQLAgent
11
+ from .td3 import TD3Agent
12
+
13
+ supported_algo = dict(
14
+ A2C=A2CAgent,
15
+ C51=C51Agent,
16
+ DDPG=DDPGAgent,
17
+ DQN=DQNAgent,
18
+ PG=PGAgent,
19
+ PPOF=PPOF,
20
+ PPOOffPolicy=PPOOffPolicyAgent,
21
+ SAC=SACAgent,
22
+ SQL=SQLAgent,
23
+ TD3=TD3Agent,
24
+ )
25
+
26
+ supported_algo_list = list(supported_algo.keys())
27
+
28
+
29
+ def env_supported(algo: str = None) -> list:
30
+ """
31
+ Return the list of envs supported by DI-engine; if ``algo`` is given, only envs supported by that algorithm are returned.
32
+ """
33
+
34
+ if algo is not None:
35
+ if algo.upper() == "A2C":
36
+ return list(ding.config.example.A2C.supported_env.keys())
37
+ elif algo.upper() == "C51":
38
+ return list(ding.config.example.C51.supported_env.keys())
39
+ elif algo.upper() == "DDPG":
40
+ return list(ding.config.example.DDPG.supported_env.keys())
41
+ elif algo.upper() == "DQN":
42
+ return list(ding.config.example.DQN.supported_env.keys())
43
+ elif algo.upper() == "PG":
44
+ return list(ding.config.example.PG.supported_env.keys())
45
+ elif algo.upper() == "PPOF":
46
+ return list(ding.config.example.PPOF.supported_env.keys())
47
+ elif algo.upper() == "PPOOFFPOLICY":
48
+ return list(ding.config.example.PPOOffPolicy.supported_env.keys())
49
+ elif algo.upper() == "SAC":
50
+ return list(ding.config.example.SAC.supported_env.keys())
51
+ elif algo.upper() == "SQL":
52
+ return list(ding.config.example.SQL.supported_env.keys())
53
+ elif algo.upper() == "TD3":
54
+ return list(ding.config.example.TD3.supported_env.keys())
55
+ else:
56
+ raise ValueError("The algo {} is not supported by di-engine.".format(algo))
57
+ else:
58
+ supported_env = set()
59
+ supported_env.update(ding.config.example.A2C.supported_env.keys())
60
+ supported_env.update(ding.config.example.C51.supported_env.keys())
61
+ supported_env.update(ding.config.example.DDPG.supported_env.keys())
62
+ supported_env.update(ding.config.example.DQN.supported_env.keys())
63
+ supported_env.update(ding.config.example.PG.supported_env.keys())
64
+ supported_env.update(ding.config.example.PPOF.supported_env.keys())
65
+ supported_env.update(ding.config.example.PPOOffPolicy.supported_env.keys())
66
+ supported_env.update(ding.config.example.SAC.supported_env.keys())
67
+ supported_env.update(ding.config.example.SQL.supported_env.keys())
68
+ supported_env.update(ding.config.example.TD3.supported_env.keys())
69
+ # return the list of the envs
70
+ return list(supported_env)
71
+
72
+
73
+ supported_env = env_supported()
74
+
75
+
76
+ def algo_supported(env_id: str = None) -> list:
77
+ """
78
+ Return the list of algorithms supported by DI-engine; if ``env_id`` is given, only algorithms that support that env are returned.
79
+ """
80
+ if env_id is not None:
81
+ algo = []
82
+ if env_id.upper() in [item.upper() for item in ding.config.example.A2C.supported_env.keys()]:
83
+ algo.append("A2C")
84
+ if env_id.upper() in [item.upper() for item in ding.config.example.C51.supported_env.keys()]:
85
+ algo.append("C51")
86
+ if env_id.upper() in [item.upper() for item in ding.config.example.DDPG.supported_env.keys()]:
87
+ algo.append("DDPG")
88
+ if env_id.upper() in [item.upper() for item in ding.config.example.DQN.supported_env.keys()]:
89
+ algo.append("DQN")
90
+ if env_id.upper() in [item.upper() for item in ding.config.example.PG.supported_env.keys()]:
91
+ algo.append("PG")
92
+ if env_id.upper() in [item.upper() for item in ding.config.example.PPOF.supported_env.keys()]:
93
+ algo.append("PPOF")
94
+ if env_id.upper() in [item.upper() for item in ding.config.example.PPOOffPolicy.supported_env.keys()]:
95
+ algo.append("PPOOffPolicy")
96
+ if env_id.upper() in [item.upper() for item in ding.config.example.SAC.supported_env.keys()]:
97
+ algo.append("SAC")
98
+ if env_id.upper() in [item.upper() for item in ding.config.example.SQL.supported_env.keys()]:
99
+ algo.append("SQL")
100
+ if env_id.upper() in [item.upper() for item in ding.config.example.TD3.supported_env.keys()]:
101
+ algo.append("TD3")
102
+
103
+ if len(algo) == 0:
104
+ raise ValueError("The env {} is not supported by di-engine.".format(env_id))
105
+ return algo
106
+ else:
107
+ return supported_algo_list
108
+
109
+
110
+ def is_supported(env_id: str = None, algo: str = None) -> bool:
111
+ """
112
+ Check if the env-algo pair is supported by di-engine.
113
+ """
114
+ # ``supported_env`` is a list, so membership is checked on it directly (the original ``.keys()`` call would raise AttributeError)
+ if env_id is not None and env_id.upper() in [item.upper() for item in supported_env]:
115
+ if algo is not None and algo.upper() in supported_algo_list:
116
+ if env_id.upper() in env_supported(algo):
117
+ return True
118
+ else:
119
+ return False
120
+ elif algo is None:
121
+ return True
122
+ else:
123
+ return False
124
+ elif env_id is None:
125
+ if algo is not None and algo.upper() in supported_algo_list:
126
+ return True
127
+ elif algo is None:
128
+ raise ValueError("Please specify the env or algo.")
129
+ else:
130
+ return False
131
+ else:
132
+ return False
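Together, these helpers let a user query the algorithm-environment support matrix before constructing an agent. A minimal sketch (the env id `LunarLander-v2` is an illustrative assumption; inspect `supported_env` for the actual list):

```python
from ding.bonus import supported_algo_list, env_supported, algo_supported, is_supported

print(supported_algo_list)                       # all bundled algorithms, e.g. ['A2C', 'C51', ...]
print(env_supported(algo='DQN'))                 # envs that ship a DQN config
print(algo_supported(env_id='LunarLander-v2'))   # algorithms that ship a config for this env
print(is_supported(env_id='LunarLander-v2', algo='DQN'))  # True only for a bundled pair
```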
DI-engine/ding/bonus/a2c.py ADDED
@@ -0,0 +1,460 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ import treetensor.torch as ttorch
8
+ from ding.framework import task, OnlineRLContext
9
+ from ding.framework.middleware import CkptSaver, trainer, \
10
+ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \
11
+ gae_estimator, final_ctx_saver
12
+ from ding.envs import BaseEnv
13
+ from ding.envs import setup_ding_env_manager
14
+ from ding.policy import A2CPolicy
15
+ from ding.utils import set_pkg_seed
16
+ from ding.utils import get_env_fps, render
17
+ from ding.config import save_config_py, compile_config
18
+ from ding.model import VAC
19
+ from ding.model import model_wrap
20
+ from ding.bonus.common import TrainingReturn, EvalReturn
21
+ from ding.config.example.A2C import supported_env_cfg
22
+ from ding.config.example.A2C import supported_env
23
+
24
+
25
+ class A2CAgent:
26
+ """
27
+ Overview:
28
+ Class of agent for training, evaluation and deployment of Reinforcement learning algorithm \
29
+ Advantage Actor Critic(A2C).
30
+ For more information about the system design of RL agent, please refer to \
31
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
32
+ Interface:
33
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
34
+ """
35
+ supported_env_list = list(supported_env_cfg.keys())
36
+ """
37
+ Overview:
38
+ List of supported envs.
39
+ Examples:
40
+ >>> from ding.bonus.a2c import A2CAgent
41
+ >>> print(A2CAgent.supported_env_list)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ env_id: str = None,
47
+ env: BaseEnv = None,
48
+ seed: int = 0,
49
+ exp_name: str = None,
50
+ model: Optional[torch.nn.Module] = None,
51
+ cfg: Optional[Union[EasyDict, dict]] = None,
52
+ policy_state_dict: str = None,
53
+ ) -> None:
54
+ """
55
+ Overview:
56
+ Initialize agent for A2C algorithm.
57
+ Arguments:
58
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
59
+ If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
60
+ If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
61
+ ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
62
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
63
+ If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
64
+ ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
65
+ If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
66
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
67
+ Default to 0.
68
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
69
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
70
+ - model (:obj:`torch.nn.Module`): The model of A2C algorithm, which should be an instance of class \
71
+ :class:`ding.model.VAC`. \
72
+ If not specified, a default model will be generated according to the configuration.
73
+ - cfg (:obj:Union[EasyDict, dict]): The configuration of A2C algorithm, which is a dict. \
74
+ Default to None. If not specified, the default configuration will be used. \
75
+ The default configuration can be found in ``ding/config/example/A2C/gym_lunarlander_v2.py``.
76
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
77
+ If specified, the policy will be loaded from this file. Default to None.
78
+
79
+ .. note::
80
+ An RL Agent Instance can be initialized in two basic ways. \
81
+ For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
82
+ and we want to train an agent with A2C algorithm with default configuration. \
83
+ Then we can initialize the agent in the following ways:
84
+ >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2')
85
+ or, if we want, we can specify the env_id in the configuration:
86
+ >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
87
+ >>> agent = A2CAgent(cfg=cfg)
88
+ There are also other arguments to specify the agent when initializing.
89
+ For example, if we want to specify the environment instance:
90
+ >>> env = CustomizedEnv('LunarLanderContinuous-v2')
91
+ >>> agent = A2CAgent(cfg=cfg, env=env)
92
+ or, if we want to specify the model:
93
+ >>> model = VAC(**cfg.policy.model)
94
+ >>> agent = A2CAgent(cfg=cfg, model=model)
95
+ or, if we want to reload the policy from a saved policy state dict:
96
+ >>> agent = A2CAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
97
+ Make sure that the configuration is consistent with the saved policy state dict.
98
+ """
99
+
100
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
101
+
102
+ if cfg is not None and not isinstance(cfg, EasyDict):
103
+ cfg = EasyDict(cfg)
104
+
105
+ if env_id is not None:
106
+ assert env_id in A2CAgent.supported_env_list, "Please use supported envs: {}".format(
107
+ A2CAgent.supported_env_list
108
+ )
109
+ if cfg is None:
110
+ cfg = supported_env_cfg[env_id]
111
+ else:
112
+ assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
113
+ else:
114
+ assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
115
+ assert cfg.env.env_id in A2CAgent.supported_env_list, "Please use supported envs: {}".format(
116
+ A2CAgent.supported_env_list
117
+ )
118
+ default_policy_config = EasyDict({"policy": A2CPolicy.default_config()})
119
+ default_policy_config.update(cfg)
120
+ cfg = default_policy_config
121
+
122
+ if exp_name is not None:
123
+ cfg.exp_name = exp_name
124
+ self.cfg = compile_config(cfg, policy=A2CPolicy)
125
+ self.exp_name = self.cfg.exp_name
126
+ if env is None:
127
+ self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
128
+ else:
129
+ assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
130
+ self.env = env
131
+
132
+ logging.getLogger().setLevel(logging.INFO)
133
+ self.seed = seed
134
+ set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
135
+ if not os.path.exists(self.exp_name):
136
+ os.makedirs(self.exp_name)
137
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
138
+ if model is None:
139
+ model = VAC(**self.cfg.policy.model)
140
+ self.policy = A2CPolicy(self.cfg.policy, model=model)
141
+ if policy_state_dict is not None:
142
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
143
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
144
+
145
+ def train(
146
+ self,
147
+ step: int = int(1e7),
148
+ collector_env_num: int = 4,
149
+ evaluator_env_num: int = 4,
150
+ n_iter_log_show: int = 500,
151
+ n_iter_save_ckpt: int = 1000,
152
+ context: Optional[str] = None,
153
+ debug: bool = False,
154
+ wandb_sweep: bool = False,
155
+ ) -> TrainingReturn:
156
+ """
157
+ Overview:
158
+ Train the agent with A2C algorithm for ``step`` iterations with ``collector_env_num`` collector \
159
+ environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
160
+ recorded and saved by wandb.
161
+ Arguments:
162
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
163
+ - collector_env_num (:obj:`int`): The collector environment number. Default to 4. \
164
+ If not specified, this default value will be used.
165
+ - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to 4. \
166
+ If not specified, this default value will be used.
167
+ - n_iter_save_ckpt (:obj:`int`): How often, in training iterations, a checkpoint is saved. \
168
+ Default to 1000.
169
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
170
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
171
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
172
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
173
+ subprocess environment manager will be used.
174
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
175
+ which is a hyper-parameter optimization process for seeking the best configurations. \
176
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
177
+ Returns:
178
+ - (:obj:`TrainingReturn`): The training result, of which the attributions are:
179
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
180
+ """
181
+
182
+ if debug:
183
+ logging.getLogger().setLevel(logging.DEBUG)
184
+ logging.debug(self.policy._model)
185
+ # define env and policy
186
+ collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector')
187
+ evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator')
188
+
189
+ with task.start(ctx=OnlineRLContext()):
190
+ task.use(
191
+ interaction_evaluator(
192
+ self.cfg,
193
+ self.policy.eval_mode,
194
+ evaluator_env,
195
+ render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
196
+ )
197
+ )
198
+ task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
199
+ task.use(
200
+ StepCollector(
201
+ self.cfg,
202
+ self.policy.collect_mode,
203
+ collector_env,
204
+ random_collect_size=self.cfg.policy.random_collect_size
205
+ if hasattr(self.cfg.policy, 'random_collect_size') else 0,
206
+ )
207
+ )
208
+ task.use(gae_estimator(self.cfg, self.policy.collect_mode))
209
+ task.use(trainer(self.cfg, self.policy.learn_mode))
210
+ task.use(
211
+ wandb_online_logger(
212
+ metric_list=self.policy._monitor_vars_learn(),
213
+ model=self.policy._model,
214
+ anonymous=True,
215
+ project_name=self.exp_name,
216
+ wandb_sweep=wandb_sweep,
217
+ )
218
+ )
219
+ task.use(termination_checker(max_env_step=step))
220
+ task.use(final_ctx_saver(name=self.exp_name))
221
+ task.run()
222
+
223
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
224
+
225
+ def deploy(
226
+ self,
227
+ enable_save_replay: bool = False,
228
+ concatenate_all_replay: bool = False,
229
+ replay_save_path: str = None,
230
+ seed: Optional[Union[int, List]] = None,
231
+ debug: bool = False
232
+ ) -> EvalReturn:
233
+ """
234
+ Overview:
235
+ Deploy the agent with A2C algorithm by interacting with the environment, during which the replay video \
236
+ can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
237
+ Arguments:
238
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
239
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
240
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
241
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
242
+ the replay video of each episode will be saved separately.
243
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
244
+ If not specified, the video will be saved in ``exp_name/videos``.
245
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
246
+ Default to None. If not specified, ``self.seed`` will be used. \
247
+ If ``seed`` is an integer, the agent will be deployed once. \
248
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
249
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
250
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
251
+ subprocess environment manager will be used.
252
+ Returns:
253
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
254
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
255
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
256
+ """
257
+
258
+ if debug:
259
+ logging.getLogger().setLevel(logging.DEBUG)
260
+ # define env and policy
261
+ env = self.env.clone(caller='evaluator')
262
+
263
+ if seed is not None and isinstance(seed, int):
264
+ seeds = [seed]
265
+ elif seed is not None and isinstance(seed, list):
266
+ seeds = seed
267
+ else:
268
+ seeds = [self.seed]
269
+
270
+ returns = []
271
+ images = []
272
+ if enable_save_replay:
273
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
274
+ env.enable_save_replay(replay_path=replay_save_path)
275
+ else:
276
+ logging.warning('No replay video will be generated during the deploy.')
277
+ if concatenate_all_replay:
278
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
279
+ concatenate_all_replay = False
280
+
281
+ def single_env_forward_wrapper(forward_fn, cuda=True):
282
+
283
+ if self.cfg.policy.action_space == 'continuous':
284
+ forward_fn = model_wrap(forward_fn, wrapper_name='deterministic_sample').forward
285
+ elif self.cfg.policy.action_space == 'discrete':
286
+ forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward
287
+ else:
288
+ raise NotImplementedError
289
+
290
+ def _forward(obs):
291
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
292
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
293
+ if cuda and torch.cuda.is_available():
294
+ obs = obs.cuda()
295
+ action = forward_fn(obs, mode='compute_actor')["action"]
296
+ # squeeze means delete batch dim, i.e. (1, A) -> (A, )
297
+ action = action.squeeze(0).detach().cpu().numpy()
298
+ return action
299
+
300
+ return _forward
301
+
302
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
303
+
304
+ # reset first to make sure the env is in the initial state
305
+ # env will be reset again in the main loop
306
+ env.reset()
307
+
308
+ for seed in seeds:
309
+ env.seed(seed, dynamic_seed=False)
310
+ return_ = 0.
311
+ step = 0
312
+ obs = env.reset()
313
+ images.append(render(env)[None]) if concatenate_all_replay else None
314
+ while True:
315
+ action = forward_fn(obs)
316
+ obs, rew, done, info = env.step(action)
317
+ images.append(render(env)[None]) if concatenate_all_replay else None
318
+ return_ += rew
319
+ step += 1
320
+ if done:
321
+ break
322
+ logging.info(f'A2C deploy is finished, final episode return with {step} steps is: {return_}')
323
+ returns.append(return_)
324
+
325
+ env.close()
326
+
327
+ if concatenate_all_replay:
328
+ images = np.concatenate(images, axis=0)
329
+ import imageio
330
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
331
+
332
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
333
+
334
+ def collect_data(
335
+ self,
336
+ env_num: int = 8,
337
+ save_data_path: Optional[str] = None,
338
+ n_sample: Optional[int] = None,
339
+ n_episode: Optional[int] = None,
340
+ context: Optional[str] = None,
341
+ debug: bool = False
342
+ ) -> None:
343
+ """
344
+ Overview:
345
+ Collect data with A2C algorithm for ``n_sample`` samples with ``env_num`` collector environments. \
346
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
347
+ ``exp_name/demo_data``.
348
+ Arguments:
349
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
350
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
351
+ If not specified, the data will be saved in ``exp_name/demo_data``.
352
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
353
+ If not specified, ``n_episode`` must be specified.
354
+ - n_episode (:obj:`int`): The number of episodes to collect (not yet implemented). Default to None. \
355
+ If not specified, ``n_sample`` must be specified.
356
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
357
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
358
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
359
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
360
+ subprocess environment manager will be used.
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         if n_episode is not None:
+             raise NotImplementedError
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.collector_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
+ 
+         if save_data_path is None:
+             save_data_path = os.path.join(self.exp_name, 'demo_data')
+ 
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 StepCollector(
+                     self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
+                 )
+             )
+             task.use(offline_data_saver(save_data_path, data_type='hdf5'))
+             task.run(max_step=1)
+         logging.info(
+             f'A2C collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
+         )
+ 
+     def batch_evaluate(
+             self,
+             env_num: int = 4,
+             n_evaluator_episode: int = 4,
+             context: Optional[str] = None,
+             debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Evaluate the agent with the A2C algorithm for ``n_evaluator_episode`` episodes with ``env_num`` \
+             evaluator environments. The evaluation result will be returned.
+             The difference between the methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` \
+             creates multiple evaluator environments to evaluate the agent and get an average performance, while \
+             ``deploy`` creates only one evaluator environment to evaluate the agent and save the replay video.
+         Arguments:
+             - env_num (:obj:`int`): The number of evaluator environments. Defaults to 4.
+             - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Defaults to 4.
+             - context (:obj:`str`): The multi-process context of the environment manager. Defaults to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, whose attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
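+         Examples:
+             A minimal usage sketch (the ``env_id`` below is an illustrative assumption):
+             >>> agent = A2CAgent(env_id='LunarLander-v2')
+             >>> result = agent.batch_evaluate(env_num=4, n_evaluator_episode=8)
+             >>> print(result.eval_value, result.eval_value_std)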
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.evaluator_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
+ 
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.launch()
+         env.reset()
+ 
+         evaluate_cfg = self.cfg
+         evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
+ 
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
+             task.run(max_step=1)
+ 
+         return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
+ 
+     @property
+     def best(self) -> 'A2CAgent':
+         """
+         Overview:
+             Load the best model from the checkpoint directory, \
+             which by default is the file ``exp_name/ckpt/eval.pth.tar``. \
+             The return value is the agent with the best model.
+         Returns:
+             - (:obj:`A2CAgent`): The agent with the best model.
+         Examples:
+             >>> agent = A2CAgent(env_id='LunarLanderContinuous-v2')
+             >>> agent.train()
+             >>> agent = agent.best
+ 
+         .. note::
+             The best model is the model with the highest evaluation return. If this method is called, the current \
+             model will be replaced by the best model.
+         """
+ 
+         best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
+         # Load the best model if it exists
+         if os.path.exists(best_model_file_path):
+             policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         return self
DI-engine/ding/bonus/c51.py ADDED
@@ -0,0 +1,459 @@
+ from typing import Optional, Union, List
+ from ditk import logging
+ from easydict import EasyDict
+ import os
+ import numpy as np
+ import torch
+ import treetensor.torch as ttorch
+ from ding.framework import task, OnlineRLContext
+ from ding.framework.middleware import CkptSaver, \
+     wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
+     OffPolicyLearner, final_ctx_saver, eps_greedy_handler, nstep_reward_enhancer
+ from ding.envs import BaseEnv
+ from ding.envs import setup_ding_env_manager
+ from ding.policy import C51Policy
+ from ding.utils import set_pkg_seed
+ from ding.utils import get_env_fps, render
+ from ding.config import save_config_py, compile_config
+ from ding.model import C51DQN
+ from ding.model import model_wrap
+ from ding.data import DequeBuffer
+ from ding.bonus.common import TrainingReturn, EvalReturn
+ from ding.config.example.C51 import supported_env_cfg
+ from ding.config.example.C51 import supported_env
+ 
+ 
+ class C51Agent:
+     """
+     Overview:
+         Agent class for training, evaluation and deployment of the reinforcement learning algorithm C51.
+         For more information about the system design of RL agents, please refer to \
+         <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
+     Interface:
+         ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+     """
+     supported_env_list = list(supported_env_cfg.keys())
+     """
+     Overview:
+         List of supported envs.
+     Examples:
+         >>> from ding.bonus.c51 import C51Agent
+         >>> print(C51Agent.supported_env_list)
+     """
+ 
+     def __init__(
+             self,
+             env_id: str = None,
+             env: BaseEnv = None,
+             seed: int = 0,
+             exp_name: str = None,
+             model: Optional[torch.nn.Module] = None,
+             cfg: Optional[Union[EasyDict, dict]] = None,
+             policy_state_dict: str = None,
+     ) -> None:
+         """
+         Overview:
+             Initialize the agent for the C51 algorithm.
+         Arguments:
+             - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                 If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                 If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                 ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+             - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                 If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified, \
+                 and will be used to create the environment instance. \
+                 If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+             - seed (:obj:`int`): The random seed, which is set before running the program. \
+                 Defaults to 0.
+             - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                 log data. Defaults to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+             - model (:obj:`torch.nn.Module`): The model of the C51 algorithm, which should be an instance of class \
+                 :class:`ding.model.C51DQN`. \
+                 If not specified, a default model will be generated according to the configuration.
+             - cfg (:obj:`Union[EasyDict, dict]`): The configuration of the C51 algorithm, which is a dict. \
+                 Defaults to None. If not specified, the default configuration will be used. \
+                 The default configuration can be found in ``ding/config/example/C51/gym_lunarlander_v2.py``.
+             - policy_state_dict (:obj:`str`): The path of a policy state dict saved by PyTorch to a local file. \
+                 If specified, the policy will be loaded from this file. Defaults to None.
+ 
+         .. note::
+             An RL agent instance can be initialized in two basic ways. \
+             For example, suppose we have an environment with id ``LunarLander-v2`` registered in gym, \
+             and we want to train an agent with the C51 algorithm with the default configuration. \
+             Then we can initialize the agent in the following ways:
+             >>> agent = C51Agent(env_id='LunarLander-v2')
+             or, if we want, we can specify the ``env_id`` in the configuration:
+             >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
+             >>> agent = C51Agent(cfg=cfg)
+             There are also other arguments to specify the agent when initializing.
+             For example, if we want to specify the environment instance:
+             >>> env = CustomizedEnv('LunarLander-v2')
+             >>> agent = C51Agent(cfg=cfg, env=env)
+             or, if we want to specify the model:
+             >>> model = C51DQN(**cfg.policy.model)
+             >>> agent = C51Agent(cfg=cfg, model=model)
+             or, if we want to reload the policy from a saved policy state dict:
+             >>> agent = C51Agent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
+             Make sure that the configuration is consistent with the saved policy state dict.
+         """
+ 
+         assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+ 
+         if cfg is not None and not isinstance(cfg, EasyDict):
+             cfg = EasyDict(cfg)
+ 
+         if env_id is not None:
+             assert env_id in C51Agent.supported_env_list, "Please use supported envs: {}".format(
+                 C51Agent.supported_env_list
+             )
+             if cfg is None:
+                 cfg = supported_env_cfg[env_id]
+             else:
+                 assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
+         else:
+             assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
+             assert cfg.env.env_id in C51Agent.supported_env_list, "Please use supported envs: {}".format(
+                 C51Agent.supported_env_list
+             )
+         default_policy_config = EasyDict({"policy": C51Policy.default_config()})
+         default_policy_config.update(cfg)
+         cfg = default_policy_config
+ 
+         if exp_name is not None:
+             cfg.exp_name = exp_name
+         self.cfg = compile_config(cfg, policy=C51Policy)
+         self.exp_name = self.cfg.exp_name
+         if env is None:
+             self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
+         else:
+             assert isinstance(env, BaseEnv), "Please use BaseEnv as the env data type."
+             self.env = env
+ 
+         logging.getLogger().setLevel(logging.INFO)
+         self.seed = seed
+         set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
+         if not os.path.exists(self.exp_name):
+             os.makedirs(self.exp_name)
+         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
+         if model is None:
+             model = C51DQN(**self.cfg.policy.model)
+         self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
+         self.policy = C51Policy(self.cfg.policy, model=model)
+         if policy_state_dict is not None:
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
+ 
+     def train(
+             self,
+             step: int = int(1e7),
+             collector_env_num: int = None,
+             evaluator_env_num: int = None,
+             n_iter_save_ckpt: int = 1000,
+             context: Optional[str] = None,
+             debug: bool = False,
+             wandb_sweep: bool = False,
+     ) -> TrainingReturn:
+         """
+         Overview:
+             Train the agent with the C51 algorithm for ``step`` environment steps with ``collector_env_num`` \
+             collector environments and ``evaluator_env_num`` evaluator environments. Information during training \
+             will be recorded and saved by wandb.
+         Arguments:
+             - step (:obj:`int`): The total training environment steps of all collector environments. Defaults to 1e7.
+             - collector_env_num (:obj:`int`): The number of collector environments. Defaults to None. \
+                 If not specified, it will be set according to the configuration.
+             - evaluator_env_num (:obj:`int`): The number of evaluator environments. Defaults to None. \
+                 If not specified, it will be set according to the configuration.
+             - n_iter_save_ckpt (:obj:`int`): The checkpoint saving frequency, in training iterations. \
+                 Defaults to 1000.
+             - context (:obj:`str`): The multi-process context of the environment manager. Defaults to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+             - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                 which is a hyper-parameter optimization process for seeking the best configuration. \
+                 Defaults to False. If True, the wandb sweep id will be used as the experiment name.
+         Returns:
+             - (:obj:`TrainingReturn`): The training result, whose attributes are:
+                 - wandb_url (:obj:`str`): The Weights & Biases (wandb) project URL of the training experiment.
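+         Examples:
+             A minimal usage sketch (the ``env_id``, step budget and env count are illustrative assumptions):
+             >>> agent = C51Agent(env_id='LunarLander-v2')
+             >>> result = agent.train(step=int(1e6), collector_env_num=4)
+             >>> print(result.wandb_url)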
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+             logging.debug(self.policy._model)
+         # define env and policy
+         collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
+         evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
+         collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
+         evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
+ 
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 interaction_evaluator(
+                     self.cfg,
+                     self.policy.eval_mode,
+                     evaluator_env,
+                     render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
+                 )
+             )
+             task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
+             task.use(eps_greedy_handler(self.cfg))
+             task.use(
+                 StepCollector(
+                     self.cfg,
+                     self.policy.collect_mode,
+                     collector_env,
+                     random_collect_size=self.cfg.policy.random_collect_size
+                     if hasattr(self.cfg.policy, 'random_collect_size') else 0,
+                 )
+             )
+             task.use(nstep_reward_enhancer(self.cfg))
+             task.use(data_pusher(self.cfg, self.buffer_))
+             task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
+             task.use(
+                 wandb_online_logger(
+                     metric_list=self.policy._monitor_vars_learn(),
+                     model=self.policy._model,
+                     anonymous=True,
+                     project_name=self.exp_name,
+                     wandb_sweep=wandb_sweep,
+                 )
+             )
+             task.use(termination_checker(max_env_step=step))
+             task.use(final_ctx_saver(name=self.exp_name))
+             task.run()
+ 
+         return TrainingReturn(wandb_url=task.ctx.wandb_url)
+ 
+     def deploy(
+             self,
+             enable_save_replay: bool = False,
+             concatenate_all_replay: bool = False,
+             replay_save_path: str = None,
+             seed: Optional[Union[int, List]] = None,
+             debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Deploy the agent with the C51 algorithm by interacting with the environment, during which the replay \
+             video can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
+         Arguments:
+             - enable_save_replay (:obj:`bool`): Whether to save the replay video. Defaults to False.
+             - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                 Defaults to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                 If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                 the replay video of each episode will be saved separately.
+             - replay_save_path (:obj:`str`): The path to save the replay video. Defaults to None. \
+                 If not specified, the video will be saved in ``exp_name/videos``.
+             - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                 Defaults to None. If not specified, ``self.seed`` will be used. \
+                 If ``seed`` is an integer, the agent will be deployed once. \
+                 If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, whose attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
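+         Examples:
+             A minimal usage sketch (the ``env_id`` and seed list are illustrative assumptions):
+             >>> agent = C51Agent(env_id='LunarLander-v2')
+             >>> result = agent.deploy(enable_save_replay=True, seed=[0, 1, 2])
+             >>> print(result.eval_value)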
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env = self.env.clone(caller='evaluator')
+ 
+         if seed is not None and isinstance(seed, int):
+             seeds = [seed]
+         elif seed is not None and isinstance(seed, list):
+             seeds = seed
+         else:
+             seeds = [self.seed]
+ 
+         returns = []
+         images = []
+         if enable_save_replay:
+             replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
+             env.enable_save_replay(replay_path=replay_save_path)
+         else:
+             logging.warning('No video will be generated during the deploy.')
+             if concatenate_all_replay:
+                 logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
+                 concatenate_all_replay = False
+ 
+         def single_env_forward_wrapper(forward_fn, cuda=True):
+ 
+             forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward
+ 
+             def _forward(obs):
+                 # unsqueeze means adding the batch dim, i.e. (O, ) -> (1, O)
+                 obs = ttorch.as_tensor(obs).unsqueeze(0)
+                 if cuda and torch.cuda.is_available():
+                     obs = obs.cuda()
+                 action = forward_fn(obs)["action"]
+                 # squeeze means deleting the batch dim, i.e. (1, A) -> (A, )
+                 action = action.squeeze(0).detach().cpu().numpy()
+                 return action
+ 
+             return _forward
+ 
+         forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
+ 
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.reset()
+ 
+         for seed in seeds:
+             env.seed(seed, dynamic_seed=False)
+             return_ = 0.
+             step = 0
+             obs = env.reset()
+             if concatenate_all_replay:
+                 images.append(render(env)[None])
+             while True:
+                 action = forward_fn(obs)
+                 obs, rew, done, info = env.step(action)
+                 if concatenate_all_replay:
+                     images.append(render(env)[None])
+                 return_ += rew
+                 step += 1
+                 if done:
+                     break
+             logging.info(f'C51 deploy is finished, final episode return over {step} steps is: {return_}')
+             returns.append(return_)
+ 
+         env.close()
+ 
+         if concatenate_all_replay:
+             images = np.concatenate(images, axis=0)
+             import imageio
+             imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
+ 
+         return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
+ 
+     def collect_data(
+             self,
+             env_num: int = 8,
+             save_data_path: Optional[str] = None,
+             n_sample: Optional[int] = None,
+             n_episode: Optional[int] = None,
+             context: Optional[str] = None,
+             debug: bool = False
+     ) -> None:
+         """
+         Overview:
+             Collect data with the C51 algorithm using ``env_num`` collector environments. \
+             The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
+             ``exp_name/demo_data``.
+         Arguments:
+             - env_num (:obj:`int`): The number of collector environments. Defaults to 8.
+             - save_data_path (:obj:`str`): The path to save the collected data. Defaults to None. \
+                 If not specified, the data will be saved in ``exp_name/demo_data``.
+             - n_sample (:obj:`int`): The number of samples to collect. Defaults to None. \
+                 If not specified, ``n_episode`` must be specified.
+             - n_episode (:obj:`int`): The number of episodes to collect. Defaults to None. \
+                 If not specified, ``n_sample`` must be specified. Episode-based collection is not implemented yet.
+             - context (:obj:`str`): The multi-process context of the environment manager. Defaults to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         if n_episode is not None:
+             raise NotImplementedError
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.collector_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
+ 
+         if save_data_path is None:
+             save_data_path = os.path.join(self.exp_name, 'demo_data')
+ 
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 StepCollector(
+                     self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
+                 )
+             )
+             task.use(offline_data_saver(save_data_path, data_type='hdf5'))
+             task.run(max_step=1)
+         logging.info(
+             f'C51 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
+         )
+ 
+     def batch_evaluate(
+             self,
+             env_num: int = 4,
+             n_evaluator_episode: int = 4,
+             context: Optional[str] = None,
+             debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Evaluate the agent with the C51 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` \
+             evaluator environments. The evaluation result will be returned.
+             The difference between the methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` \
+             creates multiple evaluator environments to evaluate the agent and get an average performance, while \
+             ``deploy`` creates only one evaluator environment to evaluate the agent and save the replay video.
+         Arguments:
+             - env_num (:obj:`int`): The number of evaluator environments. Defaults to 4.
+             - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Defaults to 4.
+             - context (:obj:`str`): The multi-process context of the environment manager. Defaults to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, whose attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.evaluator_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
+ 
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.launch()
+         env.reset()
+ 
+         evaluate_cfg = self.cfg
+         evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
+ 
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
+             task.run(max_step=1)
+ 
+         return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
+ 
+     @property
+     def best(self) -> 'C51Agent':
+         """
+         Overview:
+             Load the best model from the checkpoint directory, \
+             which by default is the file ``exp_name/ckpt/eval.pth.tar``. \
+             The return value is the agent with the best model.
+         Returns:
+             - (:obj:`C51Agent`): The agent with the best model.
+         Examples:
+             >>> agent = C51Agent(env_id='LunarLander-v2')
+             >>> agent.train()
+             >>> agent = agent.best
+ 
+         .. note::
+             The best model is the model with the highest evaluation return. If this method is called, the current \
+             model will be replaced by the best model.
+         """
+ 
+         best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
+         # Load the best model if it exists
+         if os.path.exists(best_model_file_path):
+             policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         return self
DI-engine/ding/bonus/common.py ADDED
@@ -0,0 +1,22 @@
+ from dataclasses import dataclass
+ import numpy as np
+ 
+ 
+ @dataclass
+ class TrainingReturn:
+     '''
+     Attributes:
+         wandb_url: The Weights & Biases (wandb) project URL of the training experiment.
+     '''
+     wandb_url: str
+ 
+ 
+ @dataclass
+ class EvalReturn:
+     '''
+     Attributes:
+         eval_value: The mean of evaluation return.
+         eval_value_std: The standard deviation of evaluation return.
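+     Examples:
+         A minimal construction sketch (the values below are illustrative assumptions):
+         >>> ret = EvalReturn(eval_value=np.float32(200.0), eval_value_std=np.float32(10.0))
+         >>> print(ret.eval_value)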
+     '''
+     eval_value: np.float32
+     eval_value_std: np.float32
DI-engine/ding/bonus/config.py ADDED
@@ -0,0 +1,326 @@
+ from easydict import EasyDict
+ import os
+ import gym
+ from ding.envs import BaseEnv, DingEnvWrapper
+ from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \
+     EvalEpisodeReturnWrapper, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper
+ from ding.policy import PPOFPolicy
+ 
+ 
+ def get_instance_config(env_id: str, algorithm: str) -> EasyDict:
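+     """
+     Overview:
+         Return the default policy configuration for the given ``env_id`` and ``algorithm``. \
+         Only ``algorithm='PPOF'`` is handled here; unsupported ids raise ``KeyError``.
+     Examples:
+         A minimal usage sketch (the ``env_id`` below is an illustrative assumption):
+         >>> cfg = get_instance_config('LunarLander-v2', algorithm='PPOF')
+         >>> print(cfg.n_sample)
+     """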
+     if algorithm == 'PPOF':
+         cfg = PPOFPolicy.default_config()
+         if env_id == 'LunarLander-v2':
+             cfg.n_sample = 512
+             cfg.value_norm = 'popart'
+             cfg.entropy_weight = 1e-3
+         elif env_id == 'LunarLanderContinuous-v2':
+             cfg.action_space = 'continuous'
+             cfg.n_sample = 400
+         elif env_id == 'BipedalWalker-v3':
+             cfg.learning_rate = 1e-3
+             cfg.action_space = 'continuous'
+             cfg.n_sample = 1024
+         elif env_id == 'Pendulum-v1':
+             cfg.action_space = 'continuous'
+             cfg.n_sample = 400
+         elif env_id == 'acrobot':
+             cfg.learning_rate = 1e-4
+             cfg.n_sample = 400
+         elif env_id == 'rocket_landing':
+             cfg.n_sample = 2048
+             cfg.adv_norm = False
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+             )
+         elif env_id == 'drone_fly':
+             cfg.action_space = 'continuous'
+             cfg.adv_norm = False
+             cfg.epoch_per_collect = 5
+             cfg.learning_rate = 5e-5
+             cfg.n_sample = 640
+         elif env_id == 'hybrid_moving':
+             cfg.action_space = 'hybrid'
+             cfg.n_sample = 3200
+             cfg.entropy_weight = 0.03
+             cfg.batch_size = 320
+             cfg.adv_norm = False
+             cfg.model = dict(
+                 encoder_hidden_size_list=[256, 128, 64, 64],
+                 sigma_type='fixed',
+                 fixed_sigma_value=0.3,
+                 bound_type='tanh',
+             )
+         elif env_id == 'evogym_carrier':
+             cfg.action_space = 'continuous'
+             cfg.n_sample = 2048
+             cfg.batch_size = 256
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-3
+         elif env_id == 'mario':
+             cfg.n_sample = 256
+             cfg.batch_size = 64
+             cfg.epoch_per_collect = 2
+             cfg.learning_rate = 1e-3
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 64, 128],
+                 critic_head_hidden_size=128,
+                 actor_head_hidden_size=128,
+             )
+         elif env_id == 'di_sheep':
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-4
+             cfg.adv_norm = False
+             cfg.entropy_weight = 0.001
+         elif env_id == 'procgen_bigfish':
+             cfg.n_sample = 16384
+             cfg.batch_size = 16384
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 5e-4
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 128, 256],
+                 critic_head_hidden_size=256,
+                 actor_head_hidden_size=256,
+             )
+         elif env_id in ['KangarooNoFrameskip-v4', 'BowlingNoFrameskip-v4']:
+             cfg.n_sample = 1024
+             cfg.batch_size = 128
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 0.0001
+             cfg.model = dict(
+                 encoder_hidden_size_list=[32, 64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+                 critic_head_layer_num=2,
+             )
+         elif env_id == 'PongNoFrameskip-v4':
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-4
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+             )
+         elif env_id == 'SpaceInvadersNoFrameskip-v4':
+             cfg.n_sample = 320
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 1
+             cfg.learning_rate = 1e-3
+             cfg.entropy_weight = 0.01
+             cfg.lr_scheduler = (2000, 0.1)
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+             )
+         elif env_id == 'QbertNoFrameskip-v4':
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 5e-4
+             cfg.lr_scheduler = (1000, 0.1)
+             cfg.model = dict(
+                 encoder_hidden_size_list=[64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+             )
+         elif env_id == 'minigrid_fourroom':
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.learning_rate = 3e-4
+             cfg.epoch_per_collect = 10
+             cfg.entropy_weight = 0.001
+         elif env_id == 'metadrive':
+             cfg.action_space = 'continuous'
+             cfg.entropy_weight = 0.001
+             cfg.n_sample = 3000
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 0.0001
+             cfg.model = dict(
+                 encoder_hidden_size_list=[32, 64, 64, 128],
+                 actor_head_hidden_size=128,
+                 critic_head_hidden_size=128,
+                 critic_head_layer_num=2,
+             )
+         elif env_id == 'Hopper-v3':
+             cfg.action_space = "continuous"
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-4
+         elif env_id == 'HalfCheetah-v3':
+             cfg.action_space = "continuous"
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-4
+         elif env_id == 'Walker2d-v3':
+             cfg.action_space = "continuous"
+             cfg.n_sample = 3200
+             cfg.batch_size = 320
+             cfg.epoch_per_collect = 10
+             cfg.learning_rate = 3e-4
+         else:
+             raise KeyError("not supported env type: {}".format(env_id))
+     else:
+         raise KeyError("not supported algorithm type: {}".format(algorithm))
+ 
+     return cfg
+ 
+ 
+ def get_instance_env(env_id: str) -> BaseEnv:
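+     """
+     Overview:
+         Instantiate and return a DI-engine environment for the given ``env_id``; unsupported ids \
+         raise ``KeyError``.
+     Examples:
+         A minimal usage sketch (the ``env_id`` below is an illustrative assumption):
+         >>> env = get_instance_env('LunarLander-v2')
+         >>> obs = env.reset()
+     """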
+     if env_id == 'LunarLander-v2':
+         return DingEnvWrapper(gym.make('LunarLander-v2'))
+     elif env_id == 'LunarLanderContinuous-v2':
+         return DingEnvWrapper(gym.make('LunarLanderContinuous-v2', continuous=True))
+     elif env_id == 'BipedalWalker-v3':
+         return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True, 'rew_clip': True})
+     elif env_id == 'Pendulum-v1':
+         return DingEnvWrapper(gym.make('Pendulum-v1'), cfg={'act_scale': True})
+     elif env_id == 'acrobot':
+         return DingEnvWrapper(gym.make('Acrobot-v1'))
+     elif env_id == 'rocket_landing':
+         from dizoo.rocket.envs import RocketEnv
+         cfg = EasyDict({
+             'task': 'landing',
+             'max_steps': 800,
+         })
+         return RocketEnv(cfg)
+     elif env_id == 'drone_fly':
+         from dizoo.gym_pybullet_drones.envs import GymPybulletDronesEnv
+         cfg = EasyDict({
+             'env_id': 'flythrugate-aviary-v0',
+             'action_type': 'VEL',
+         })
+         return GymPybulletDronesEnv(cfg)
+     elif env_id == 'hybrid_moving':
+         import gym_hybrid
+         return DingEnvWrapper(gym.make('Moving-v0'))
+     elif env_id == 'evogym_carrier':
+         import evogym.envs
+         from evogym import sample_robot, WorldObject
+         path = os.path.join(os.path.dirname(__file__), '../../dizoo/evogym/envs/world_data/carry_bot.json')
+         robot_object = WorldObject.from_json(path)
+         body = robot_object.get_structure()
+         return DingEnvWrapper(
+             gym.make('Carrier-v0', body=body),
+             cfg={
+                 'env_wrapper': [
+                     lambda env: TimeLimitWrapper(env, max_limit=300),
+                     lambda env: EvalEpisodeReturnWrapper(env),
+                 ]
+             }
+         )
+     elif env_id == 'mario':
+         import gym_super_mario_bros
+         from nes_py.wrappers import JoypadSpace
+         return DingEnvWrapper(
+             JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-1-1-v1"), [["right"], ["right", "A"]]),
+             cfg={
+                 'env_wrapper': [
+                     lambda env: MaxAndSkipWrapper(env, skip=4),
+                     lambda env: WarpFrameWrapper(env, size=84),
+                     lambda env: ScaledFloatFrameWrapper(env),
+                     lambda env: FrameStackWrapper(env, n_frames=4),
+                     lambda env: TimeLimitWrapper(env, max_limit=200),
+                     lambda env: EvalEpisodeReturnWrapper(env),
+                 ]
+             }
+         )
+     elif env_id == 'di_sheep':
+         from sheep_env import SheepEnv
+         return DingEnvWrapper(SheepEnv(level=9))
+     elif env_id == 'procgen_bigfish':
+         return DingEnvWrapper(
+             gym.make('procgen:procgen-bigfish-v0', start_level=0, num_levels=1),
+             cfg={
+                 'env_wrapper': [
+                     lambda env: TransposeWrapper(env),
+                     lambda env: ScaledFloatFrameWrapper(env),
+                     lambda env: EvalEpisodeReturnWrapper(env),
+                 ]
+             },
+             seed_api=False,
+         )
+     elif env_id == 'Hopper-v3':
+         cfg = EasyDict(
+             env_id='Hopper-v3',
+             env_wrapper='mujoco_default',
+             act_scale=True,
+             rew_clip=True,
+         )
+         return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg)
+     elif env_id == 'HalfCheetah-v3':
+         cfg = EasyDict(
+             env_id='HalfCheetah-v3',
+             env_wrapper='mujoco_default',
+             act_scale=True,
+             rew_clip=True,
+         )
+         return DingEnvWrapper(gym.make('HalfCheetah-v3'), cfg=cfg)
+     elif env_id == 'Walker2d-v3':
+         cfg = EasyDict(
+             env_id='Walker2d-v3',
+             env_wrapper='mujoco_default',
+             act_scale=True,
+             rew_clip=True,
+         )
+         return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg)
+     elif env_id in [
+             'BowlingNoFrameskip-v4',
+             'BreakoutNoFrameskip-v4',
+             'GopherNoFrameskip-v4',
+             'KangarooNoFrameskip-v4',
+             'PongNoFrameskip-v4',
+             'QbertNoFrameskip-v4',
+             'SpaceInvadersNoFrameskip-v4',
+     ]:
+         cfg = EasyDict({
+             'env_id': env_id,
+             'env_wrapper': 'atari_default',
+         })
+         ding_env_atari = DingEnvWrapper(gym.make(env_id), cfg=cfg)
+         return ding_env_atari
+     elif env_id == 'minigrid_fourroom':
+         import gymnasium
+         return DingEnvWrapper(
+             gymnasium.make('MiniGrid-FourRooms-v0'),
+             cfg={
+                 'env_wrapper': [
+                     lambda env: GymToGymnasiumWrapper(env),
+                     lambda env: FlatObsWrapper(env),
+                     lambda env: TimeLimitWrapper(env, max_limit=300),
+                     lambda env: EvalEpisodeReturnWrapper(env),
+                 ]
+             }
+         )
+     elif env_id == 'metadrive':
+         from dizoo.metadrive.env.drive_env import MetaDrivePPOOriginEnv
+         from dizoo.metadrive.env.drive_wrapper import DriveEnvWrapper
+         cfg = dict(
+             map='XSOS',
+             horizon=4000,
+             out_of_road_penalty=40.0,
+             crash_vehicle_penalty=40.0,
+             out_of_route_done=True,
+         )
+         cfg = EasyDict(cfg)
+         return DriveEnvWrapper(MetaDrivePPOOriginEnv(cfg))
+     else:
+         raise KeyError("not supported env type: {}".format(env_id))
+ 
+ 
+ def get_hybrid_shape(action_space) -> EasyDict:
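+     """
+     Overview:
+         Convert a hybrid gym action space, given as a pair of ``(Discrete, Box)`` spaces, into the \
+         shape dict expected by DI-engine models.
+     Examples:
+         A minimal usage sketch (the spaces below are illustrative assumptions):
+         >>> from gym import spaces
+         >>> hybrid = (spaces.Discrete(3), spaces.Box(low=-1., high=1., shape=(2, )))
+         >>> print(get_hybrid_shape(hybrid))
+     """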
+     return EasyDict({
+         'action_type_shape': action_space[0].n,
+         'action_args_shape': action_space[1].shape,
+     })
DI-engine/ding/bonus/ddpg.py ADDED
@@ -0,0 +1,456 @@
+ from typing import Optional, Union, List
+ from ditk import logging
+ from easydict import EasyDict
+ import os
+ import numpy as np
+ import torch
+ import treetensor.torch as ttorch
+ from ding.framework import task, OnlineRLContext
+ from ding.framework.middleware import CkptSaver, \
+     wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
+     OffPolicyLearner, final_ctx_saver
+ from ding.envs import BaseEnv
+ from ding.envs import setup_ding_env_manager
+ from ding.policy import DDPGPolicy
+ from ding.utils import set_pkg_seed
+ from ding.utils import get_env_fps, render
+ from ding.config import save_config_py, compile_config
+ from ding.model import ContinuousQAC
+ from ding.data import DequeBuffer
+ from ding.bonus.common import TrainingReturn, EvalReturn
+ from ding.config.example.DDPG import supported_env_cfg
+ from ding.config.example.DDPG import supported_env
+ 
+ 
+ class DDPGAgent:
+     """
+     Overview:
+         Agent class for training, evaluation and deployment of the reinforcement learning algorithm \
+         Deep Deterministic Policy Gradient (DDPG).
+         For more information about the system design of RL agents, please refer to \
+         <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
+     Interface:
+         ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+     """
+     supported_env_list = list(supported_env_cfg.keys())
+     """
+     Overview:
+         List of supported envs.
+     Examples:
+         >>> from ding.bonus.ddpg import DDPGAgent
+         >>> print(DDPGAgent.supported_env_list)
+     """
+ 
+     def __init__(
+             self,
+             env_id: str = None,
+             env: BaseEnv = None,
+             seed: int = 0,
+             exp_name: str = None,
+             model: Optional[torch.nn.Module] = None,
+             cfg: Optional[Union[EasyDict, dict]] = None,
+             policy_state_dict: str = None,
+     ) -> None:
+         """
+         Overview:
+             Initialize the agent for the DDPG algorithm.
+         Arguments:
+             - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                 If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                 If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                 ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+             - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                 If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified, \
+                 and will be used to create the environment instance. \
+                 If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+             - seed (:obj:`int`): The random seed, which is set before running the program. \
+                 Defaults to 0.
+             - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                 log data. Defaults to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+             - model (:obj:`torch.nn.Module`): The model of the DDPG algorithm, which should be an instance of class \
+                 :class:`ding.model.ContinuousQAC`. \
+                 If not specified, a default model will be generated according to the configuration.
+             - cfg (:obj:`Union[EasyDict, dict]`): The configuration of the DDPG algorithm, which is a dict. \
+                 Defaults to None. If not specified, the default configuration will be used. \
+                 The default configuration can be found in \
+                 ``ding/config/example/DDPG/gym_lunarlandercontinuous_v2.py``.
+             - policy_state_dict (:obj:`str`): The path of a policy state dict saved by PyTorch to a local file. \
+                 If specified, the policy will be loaded from this file. Defaults to None.
+ 
+         .. note::
+             An RL agent instance can be initialized in two basic ways. \
+             For example, suppose we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
+             and we want to train an agent with the DDPG algorithm with the default configuration. \
+             Then we can initialize the agent in the following ways:
+             >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2')
+             or, if we want, we can specify the ``env_id`` in the configuration:
+             >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
+             >>> agent = DDPGAgent(cfg=cfg)
+             There are also other arguments to specify the agent when initializing.
+             For example, if we want to specify the environment instance:
+             >>> env = CustomizedEnv('LunarLanderContinuous-v2')
+             >>> agent = DDPGAgent(cfg=cfg, env=env)
+             or, if we want to specify the model:
+             >>> model = ContinuousQAC(**cfg.policy.model)
+             >>> agent = DDPGAgent(cfg=cfg, model=model)
+             or, if we want to reload the policy from a saved policy state dict:
+             >>> agent = DDPGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
+             Make sure that the configuration is consistent with the saved policy state dict.
+         """
+ 
+         assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+ 
+         if cfg is not None and not isinstance(cfg, EasyDict):
+             cfg = EasyDict(cfg)
+ 
+         if env_id is not None:
+             assert env_id in DDPGAgent.supported_env_list, "Please use supported envs: {}".format(
+                 DDPGAgent.supported_env_list
+             )
+             if cfg is None:
+                 cfg = supported_env_cfg[env_id]
+             else:
+                 assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
+         else:
+             assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
+             assert cfg.env.env_id in DDPGAgent.supported_env_list, "Please use supported envs: {}".format(
+                 DDPGAgent.supported_env_list
+             )
+         default_policy_config = EasyDict({"policy": DDPGPolicy.default_config()})
+         default_policy_config.update(cfg)
+         cfg = default_policy_config
+ 
+         if exp_name is not None:
+             cfg.exp_name = exp_name
+         self.cfg = compile_config(cfg, policy=DDPGPolicy)
+         self.exp_name = self.cfg.exp_name
+         if env is None:
+             self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
+         else:
+             assert isinstance(env, BaseEnv), "Please use BaseEnv as the env data type."
+             self.env = env
+ 
+         logging.getLogger().setLevel(logging.INFO)
+         self.seed = seed
+         set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
+         if not os.path.exists(self.exp_name):
+             os.makedirs(self.exp_name)
+         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
+         if model is None:
+             model = ContinuousQAC(**self.cfg.policy.model)
+         self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
+         self.policy = DDPGPolicy(self.cfg.policy, model=model)
+         if policy_state_dict is not None:
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
+ 
+     def train(
+             self,
+             step: int = int(1e7),
+             collector_env_num: int = None,
+             evaluator_env_num: int = None,
+             n_iter_log_show: int = 500,
+             n_iter_save_ckpt: int = 1000,
+             context: Optional[str] = None,
+             debug: bool = False,
+             wandb_sweep: bool = False,
+     ) -> TrainingReturn:
+         """
+         Overview:
+             Train the agent with the DDPG algorithm for ``step`` environment steps with ``collector_env_num`` \
+             collector environments and ``evaluator_env_num`` evaluator environments. Information during training \
+             will be recorded and saved by wandb.
+         Arguments:
+             - step (:obj:`int`): The total training environment steps of all collector environments. Defaults to 1e7.
+             - collector_env_num (:obj:`int`): The number of collector environments. Defaults to None. \
+                 If not specified, it will be set according to the configuration.
+             - evaluator_env_num (:obj:`int`): The number of evaluator environments. Defaults to None. \
+                 If not specified, it will be set according to the configuration.
+             - n_iter_log_show (:obj:`int`): The logging frequency, in training iterations. Defaults to 500. \
+                 Currently reserved: it is not used by the training pipeline below.
+             - n_iter_save_ckpt (:obj:`int`): The checkpoint saving frequency, in training iterations. \
+                 Defaults to 1000.
+             - context (:obj:`str`): The multi-process context of the environment manager. Defaults to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+             - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                 which is a hyper-parameter optimization process for seeking the best configuration. \
+                 Defaults to False. If True, the wandb sweep id will be used as the experiment name.
+         Returns:
+             - (:obj:`TrainingReturn`): The training result, whose attributes are:
+                 - wandb_url (:obj:`str`): The Weights & Biases (wandb) project URL of the training experiment.
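+         Examples:
+             A minimal usage sketch (the ``env_id`` and step budget are illustrative assumptions):
+             >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2')
+             >>> result = agent.train(step=int(1e6))
+             >>> print(result.wandb_url)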
+         """
+ 
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+             logging.debug(self.policy._model)
+         # define env and policy
+         collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
+         evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
+         collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
+         evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
+ 
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 interaction_evaluator(
+                     self.cfg,
+                     self.policy.eval_mode,
+                     evaluator_env,
+                     render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
+                 )
+             )
+             task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
+             task.use(
+                 StepCollector(
+                     self.cfg,
+                     self.policy.collect_mode,
+                     collector_env,
+                     random_collect_size=self.cfg.policy.random_collect_size
+                     if hasattr(self.cfg.policy, 'random_collect_size') else 0,
+                 )
+             )
+             task.use(data_pusher(self.cfg, self.buffer_))
+             task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
+             task.use(
+                 wandb_online_logger(
+                     metric_list=self.policy._monitor_vars_learn(),
+                     model=self.policy._model,
+                     anonymous=True,
+                     project_name=self.exp_name,
+                     wandb_sweep=wandb_sweep,
+                 )
+             )
+             task.use(termination_checker(max_env_step=step))
+             task.use(final_ctx_saver(name=self.exp_name))
+             task.run()
+ 
+         return TrainingReturn(wandb_url=task.ctx.wandb_url)
+ 
+     def deploy(
+             self,
+             enable_save_replay: bool = False,
+             concatenate_all_replay: bool = False,
+             replay_save_path: str = None,
+             seed: Optional[Union[int, List]] = None,
+             debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Deploy the agent with the DDPG algorithm by interacting with the environment, during which the replay \
+             video can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
+         Arguments:
+             - enable_save_replay (:obj:`bool`): Whether to save the replay video. Defaults to False.
+             - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                 Defaults to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                 If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                 the replay video of each episode will be saved separately.
+             - replay_save_path (:obj:`str`): The path to save the replay video. Defaults to None. \
+                 If not specified, the video will be saved in ``exp_name/videos``.
+             - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                 Defaults to None. If not specified, ``self.seed`` will be used. \
+                 If ``seed`` is an integer, the agent will be deployed once. \
+                 If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Defaults to False. \
+                 If set True, the base environment manager will be used for easy debugging. Otherwise, \
+                 the subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, whose attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
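+         Examples:
+             A minimal usage sketch (the ``env_id``, path and seed are illustrative assumptions):
+             >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2')
+             >>> agent.deploy(enable_save_replay=True, replay_save_path='./videos', seed=0)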
259
+ """
260
+
261
+ if debug:
262
+ logging.getLogger().setLevel(logging.DEBUG)
263
+ # define env and policy
264
+ env = self.env.clone(caller='evaluator')
265
+
266
+ if seed is not None and isinstance(seed, int):
267
+ seeds = [seed]
268
+ elif seed is not None and isinstance(seed, list):
269
+ seeds = seed
270
+ else:
271
+ seeds = [self.seed]
272
+
273
+ returns = []
274
+ images = []
275
+ if enable_save_replay:
276
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
277
+ env.enable_save_replay(replay_path=replay_save_path)
278
+ else:
279
+ logging.warning('No video would be generated during the deploy.')
280
+ if concatenate_all_replay:
281
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
282
+ concatenate_all_replay = False
283
+
284
+ def single_env_forward_wrapper(forward_fn, cuda=True):
285
+
286
+ def _forward(obs):
287
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
288
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
289
+ if cuda and torch.cuda.is_available():
290
+ obs = obs.cuda()
291
+ action = forward_fn(obs, mode='compute_actor')["action"]
292
+ # squeeze means delete batch dim, i.e. (1, A) -> (A, )
293
+ action = action.squeeze(0).detach().cpu().numpy()
294
+ return action
295
+
296
+ return _forward
297
+
298
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
299
+
300
+ # reset first to make sure the env is in the initial state
301
+ # env will be reset again in the main loop
302
+ env.reset()
303
+
304
+ for seed in seeds:
305
+ env.seed(seed, dynamic_seed=False)
306
+ return_ = 0.
307
+ step = 0
308
+ obs = env.reset()
309
+ images.append(render(env)[None]) if concatenate_all_replay else None
310
+ while True:
311
+ action = forward_fn(obs)
312
+ obs, rew, done, info = env.step(action)
313
+ images.append(render(env)[None]) if concatenate_all_replay else None
314
+ return_ += rew
315
+ step += 1
316
+ if done:
317
+ break
318
+ logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}')
319
+ returns.append(return_)
320
+
321
+ env.close()
322
+
323
+ if concatenate_all_replay:
324
+ images = np.concatenate(images, axis=0)
325
+ import imageio
326
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
327
+
328
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
329
+
330
+ def collect_data(
331
+ self,
332
+ env_num: int = 8,
333
+ save_data_path: Optional[str] = None,
334
+ n_sample: Optional[int] = None,
335
+ n_episode: Optional[int] = None,
336
+ context: Optional[str] = None,
337
+ debug: bool = False
338
+ ) -> None:
339
+ """
340
+ Overview:
341
+ Collect data with DDPG algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \
342
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
343
+ ``exp_name/demo_data``.
344
+ Arguments:
345
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
346
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
347
+ If not specified, the data will be saved in ``exp_name/demo_data``.
348
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
349
+ If not specified, ``n_episode`` must be specified.
350
+ - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
351
+ If not specified, ``n_sample`` must be specified.
352
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
353
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
354
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
355
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
356
+ subprocess environment manager will be used.
357
+ """
358
+
359
+ if debug:
360
+ logging.getLogger().setLevel(logging.DEBUG)
361
+ if n_episode is not None:
362
+ raise NotImplementedError
363
+ # define env and policy
364
+ env_num = env_num if env_num else self.cfg.env.collector_env_num
365
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
366
+
367
+ if save_data_path is None:
368
+ save_data_path = os.path.join(self.exp_name, 'demo_data')
369
+
370
+ # main execution task
371
+ with task.start(ctx=OnlineRLContext()):
372
+ task.use(
373
+ StepCollector(
374
+ self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
375
+ )
376
+ )
377
+ task.use(offline_data_saver(save_data_path, data_type='hdf5'))
378
+ task.run(max_step=1)
379
+ logging.info(
380
+ f'DDPG collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
381
+ )
382
+
383
+ def batch_evaluate(
384
+ self,
385
+ env_num: int = 4,
386
+ n_evaluator_episode: int = 4,
387
+ context: Optional[str] = None,
388
+ debug: bool = False
389
+ ) -> EvalReturn:
390
+ """
391
+ Overview:
392
+ Evaluate the agent with DDPG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \
393
+ environments. The evaluation result will be returned.
394
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
395
+ multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \
396
+ will only create one evaluator environment to evaluate the agent and save the replay video.
397
+ Arguments:
398
+ - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
399
+ - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
400
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
401
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
402
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
403
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
404
+ subprocess environment manager will be used.
405
+ Returns:
406
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
407
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
408
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
409
+ """
410
+
411
+ if debug:
412
+ logging.getLogger().setLevel(logging.DEBUG)
413
+ # define env and policy
414
+ env_num = env_num if env_num else self.cfg.env.evaluator_env_num
415
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
416
+
417
+ # reset first to make sure the env is in the initial state
418
+ # env will be reset again in the main loop
419
+ env.launch()
420
+ env.reset()
421
+
422
+ evaluate_cfg = self.cfg
423
+ evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
424
+
425
+ # main execution task
426
+ with task.start(ctx=OnlineRLContext()):
427
+ task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
428
+ task.run(max_step=1)
429
+
430
+ return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
431
+
432
+ @property
433
+ def best(self) -> 'DDPGAgent':
434
+ """
435
+ Overview:
436
+ Load the best model from the checkpoint directory, \
437
+ which by default is the file ``exp_name/ckpt/eval.pth.tar``. \
438
+ The return value is the agent with the best model.
439
+ Returns:
440
+ - (:obj:`DDPGAgent`): The agent with the best model.
441
+ Examples:
442
+ >>> agent = DDPGAgent(env_id='LunarLanderContinuous-v2')
443
+ >>> agent.train()
444
+ >>> agent = agent.best
445
+
446
+ .. note::
447
+ The best model is the model with the highest evaluation return. If this method is called, the current \
448
+ model will be replaced by the best model.
449
+ """
450
+
451
+ best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
452
+ # Load best model if it exists
453
+ if os.path.exists(best_model_file_path):
454
+ policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
455
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
456
+ return self
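A minimal end-to-end sketch of the agent API defined above, assuming ``DDPGAgent.train`` mirrors the ``train(step=...)`` signature of the other agents in this commit; the experiment name and step counts below are illustrative, not defaults from the commit:

from ding.bonus.ddpg import DDPGAgent

agent = DDPGAgent(env_id='LunarLanderContinuous-v2', exp_name='ddpg_demo')  # 'ddpg_demo' is a hypothetical name
agent.train(step=int(1e5))                    # checkpoints are written under ddpg_demo/ckpt
agent.collect_data(env_num=8, n_sample=1024)  # transitions saved as hdf5 under ddpg_demo/demo_data
result = agent.best.batch_evaluate(env_num=4, n_evaluator_episode=4)
print(result.eval_value, result.eval_value_std)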
DI-engine/ding/bonus/dqn.py ADDED
@@ -0,0 +1,460 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ import treetensor.torch as ttorch
8
+ from ding.framework import task, OnlineRLContext
9
+ from ding.framework.middleware import CkptSaver, \
10
+ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
11
+ OffPolicyLearner, final_ctx_saver, nstep_reward_enhancer, eps_greedy_handler
12
+ from ding.envs import BaseEnv
13
+ from ding.envs import setup_ding_env_manager
14
+ from ding.policy import DQNPolicy
15
+ from ding.utils import set_pkg_seed
16
+ from ding.utils import get_env_fps, render
17
+ from ding.config import save_config_py, compile_config
18
+ from ding.model import DQN
19
+ from ding.model import model_wrap
20
+ from ding.data import DequeBuffer
21
+ from ding.bonus.common import TrainingReturn, EvalReturn
22
+ from ding.config.example.DQN import supported_env_cfg
23
+ from ding.config.example.DQN import supported_env
24
+
25
+
26
+ class DQNAgent:
27
+ """
28
+ Overview:
29
+ Class of agent for training, evaluation and deployment of the Reinforcement Learning algorithm Deep Q-Network (DQN).
30
+ For more information about the system design of RL agent, please refer to \
31
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
32
+ Interface:
33
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
34
+ """
35
+ supported_env_list = list(supported_env_cfg.keys())
36
+ """
37
+ Overview:
38
+ List of supported envs.
39
+ Examples:
40
+ >>> from ding.bonus.dqn import DQNAgent
41
+ >>> print(DQNAgent.supported_env_list)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ env_id: str = None,
47
+ env: BaseEnv = None,
48
+ seed: int = 0,
49
+ exp_name: str = None,
50
+ model: Optional[torch.nn.Module] = None,
51
+ cfg: Optional[Union[EasyDict, dict]] = None,
52
+ policy_state_dict: str = None,
53
+ ) -> None:
54
+ """
55
+ Overview:
56
+ Initialize agent for DQN algorithm.
57
+ Arguments:
58
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
59
+ If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
60
+ If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
61
+ ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
62
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
63
+ If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
64
+ ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
65
+ If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
66
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
67
+ Default to 0.
68
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
69
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
70
+ - model (:obj:`torch.nn.Module`): The model of DQN algorithm, which should be an instance of class \
71
+ :class:`ding.model.DQN`. \
72
+ If not specified, a default model will be generated according to the configuration.
73
+ - cfg (:obj:`Union[EasyDict, dict]`): The configuration of DQN algorithm, which is a dict. \
74
+ Default to None. If not specified, the default configuration will be used. \
75
+ The default configuration can be found in ``ding/config/example/DQN/gym_lunarlander_v2.py``.
76
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
77
+ If specified, the policy will be loaded from this file. Default to None.
78
+
79
+ .. note::
80
+ An RL Agent Instance can be initialized in two basic ways. \
81
+ For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
82
+ and we want to train an agent with DQN algorithm with default configuration. \
83
+ Then we can initialize the agent in the following ways:
84
+ >>> agent = DQNAgent(env_id='LunarLander-v2')
85
+ or, if we want to specify the env_id in the configuration:
86
+ >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
87
+ >>> agent = DQNAgent(cfg=cfg)
88
+ There are also other arguments to specify the agent when initializing.
89
+ For example, if we want to specify the environment instance:
90
+ >>> env = CustomizedEnv('LunarLander-v2')
91
+ >>> agent = DQNAgent(cfg=cfg, env=env)
92
+ or, if we want to specify the model:
93
+ >>> model = DQN(**cfg.policy.model)
94
+ >>> agent = DQNAgent(cfg=cfg, model=model)
95
+ or, if we want to reload the policy from a saved policy state dict:
96
+ >>> agent = DQNAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
97
+ Make sure that the configuration is consistent with the saved policy state dict.
98
+ """
99
+
100
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
101
+
102
+ if cfg is not None and not isinstance(cfg, EasyDict):
103
+ cfg = EasyDict(cfg)
104
+
105
+ if env_id is not None:
106
+ assert env_id in DQNAgent.supported_env_list, "Please use supported envs: {}".format(
107
+ DQNAgent.supported_env_list
108
+ )
109
+ if cfg is None:
110
+ cfg = supported_env_cfg[env_id]
111
+ else:
112
+ assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
113
+ else:
114
+ assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
115
+ assert cfg.env.env_id in DQNAgent.supported_env_list, "Please use supported envs: {}".format(
116
+ DQNAgent.supported_env_list
117
+ )
118
+ default_policy_config = EasyDict({"policy": DQNPolicy.default_config()})
119
+ default_policy_config.update(cfg)
120
+ cfg = default_policy_config
121
+
122
+ if exp_name is not None:
123
+ cfg.exp_name = exp_name
124
+ self.cfg = compile_config(cfg, policy=DQNPolicy)
125
+ self.exp_name = self.cfg.exp_name
126
+ if env is None:
127
+ self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
128
+ else:
129
+ assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
130
+ self.env = env
131
+
132
+ logging.getLogger().setLevel(logging.INFO)
133
+ self.seed = seed
134
+ set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
135
+ if not os.path.exists(self.exp_name):
136
+ os.makedirs(self.exp_name)
137
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
138
+ if model is None:
139
+ model = DQN(**self.cfg.policy.model)
140
+ self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
141
+ self.policy = DQNPolicy(self.cfg.policy, model=model)
142
+ if policy_state_dict is not None:
143
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
144
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
145
+
146
+ def train(
147
+ self,
148
+ step: int = int(1e7),
149
+ collector_env_num: int = None,
150
+ evaluator_env_num: int = None,
151
+ n_iter_save_ckpt: int = 1000,
152
+ context: Optional[str] = None,
153
+ debug: bool = False,
154
+ wandb_sweep: bool = False,
155
+ ) -> TrainingReturn:
156
+ """
157
+ Overview:
158
+ Train the agent with DQN algorithm for ``step`` environment steps with ``collector_env_num`` collector \
159
+ environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
160
+ recorded and saved by wandb.
161
+ Arguments:
162
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
163
+ - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
164
+ If not specified, it will be set according to the configuration.
165
+ - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
166
+ If not specified, it will be set according to the configuration.
167
+ - n_iter_save_ckpt (:obj:`int`): The frequency, in training iterations, at which checkpoints are saved. \
168
+ Default to 1000.
169
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
170
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
171
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
172
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
173
+ subprocess environment manager will be used.
174
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
175
+ which is a hyper-parameter optimization process for seeking the best configurations. \
176
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
177
+ Returns:
178
+ - (:obj:`TrainingReturn`): The training result, of which the attributes are:
179
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project URL of the training experiment.
180
+ """
181
+
182
+ if debug:
183
+ logging.getLogger().setLevel(logging.DEBUG)
184
+ logging.debug(self.policy._model)
185
+ # define env and policy
186
+ collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
187
+ evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
188
+ collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
189
+ evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
190
+
191
+ with task.start(ctx=OnlineRLContext()):
192
+ task.use(
193
+ interaction_evaluator(
194
+ self.cfg,
195
+ self.policy.eval_mode,
196
+ evaluator_env,
197
+ render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
198
+ )
199
+ )
200
+ task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
201
+ task.use(eps_greedy_handler(self.cfg))
202
+ task.use(
203
+ StepCollector(
204
+ self.cfg,
205
+ self.policy.collect_mode,
206
+ collector_env,
207
+ random_collect_size=self.cfg.policy.random_collect_size
208
+ if hasattr(self.cfg.policy, 'random_collect_size') else 0,
209
+ )
210
+ )
211
+ if "nstep" in self.cfg.policy and self.cfg.policy.nstep > 1:
212
+ task.use(nstep_reward_enhancer(self.cfg))
213
+ task.use(data_pusher(self.cfg, self.buffer_))
214
+ task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
215
+ task.use(
216
+ wandb_online_logger(
217
+ metric_list=self.policy._monitor_vars_learn(),
218
+ model=self.policy._model,
219
+ anonymous=True,
220
+ project_name=self.exp_name,
221
+ wandb_sweep=wandb_sweep,
222
+ )
223
+ )
224
+ task.use(termination_checker(max_env_step=step))
225
+ task.use(final_ctx_saver(name=self.exp_name))
226
+ task.run()
227
+
228
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
229
+
230
+ def deploy(
231
+ self,
232
+ enable_save_replay: bool = False,
233
+ concatenate_all_replay: bool = False,
234
+ replay_save_path: str = None,
235
+ seed: Optional[Union[int, List]] = None,
236
+ debug: bool = False
237
+ ) -> EvalReturn:
238
+ """
239
+ Overview:
240
+ Deploy the agent with DQN algorithm by interacting with the environment, during which the replay video \
241
+ can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
242
+ Arguments:
243
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
244
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
245
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
246
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
247
+ the replay video of each episode will be saved separately.
248
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
249
+ If not specified, the video will be saved in ``exp_name/videos``.
250
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
251
+ Default to None. If not specified, ``self.seed`` will be used. \
252
+ If ``seed`` is an integer, the agent will be deployed once. \
253
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
254
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
255
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
256
+ subprocess environment manager will be used.
257
+ Returns:
258
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
259
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
260
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
261
+ """
262
+
263
+ if debug:
264
+ logging.getLogger().setLevel(logging.DEBUG)
265
+ # define env and policy
266
+ env = self.env.clone(caller='evaluator')
267
+
268
+ if seed is not None and isinstance(seed, int):
269
+ seeds = [seed]
270
+ elif seed is not None and isinstance(seed, list):
271
+ seeds = seed
272
+ else:
273
+ seeds = [self.seed]
274
+
275
+ returns = []
276
+ images = []
277
+ if enable_save_replay:
278
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
279
+ env.enable_save_replay(replay_path=replay_save_path)
280
+ else:
281
+ logging.warning('No video would be generated during the deploy.')
282
+ if concatenate_all_replay:
283
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
284
+ concatenate_all_replay = False
285
+
286
+ def single_env_forward_wrapper(forward_fn, cuda=True):
287
+
288
+ forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward
289
+
290
+ def _forward(obs):
291
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
292
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
293
+ if cuda and torch.cuda.is_available():
294
+ obs = obs.cuda()
295
+ action = forward_fn(obs)["action"]
296
+ # squeeze means delete batch dim, i.e. (1, A) -> (A, )
297
+ action = action.squeeze(0).detach().cpu().numpy()
298
+ return action
299
+
300
+ return _forward
301
+
302
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
303
+
304
+ # reset first to make sure the env is in the initial state
305
+ # env will be reset again in the main loop
306
+ env.reset()
307
+
308
+ for seed in seeds:
309
+ env.seed(seed, dynamic_seed=False)
310
+ return_ = 0.
311
+ step = 0
312
+ obs = env.reset()
313
+ images.append(render(env)[None]) if concatenate_all_replay else None
314
+ while True:
315
+ action = forward_fn(obs)
316
+ obs, rew, done, info = env.step(action)
317
+ images.append(render(env)[None]) if concatenate_all_replay else None
318
+ return_ += rew
319
+ step += 1
320
+ if done:
321
+ break
322
+ logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}')
323
+ returns.append(return_)
324
+
325
+ env.close()
326
+
327
+ if concatenate_all_replay:
328
+ images = np.concatenate(images, axis=0)
329
+ import imageio
330
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
331
+
332
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
333
+
334
+ def collect_data(
335
+ self,
336
+ env_num: int = 8,
337
+ save_data_path: Optional[str] = None,
338
+ n_sample: Optional[int] = None,
339
+ n_episode: Optional[int] = None,
340
+ context: Optional[str] = None,
341
+ debug: bool = False
342
+ ) -> None:
343
+ """
344
+ Overview:
345
+ Collect ``n_sample`` samples with the DQN algorithm using ``env_num`` collector environments. \
346
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
347
+ ``exp_name/demo_data``.
348
+ Arguments:
349
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
350
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
351
+ If not specified, the data will be saved in ``exp_name/demo_data``.
352
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
353
+ If not specified, ``n_episode`` must be specified.
354
+ - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
355
+ Episode-based collection is not implemented yet; ``n_sample`` must be specified.
356
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
357
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
358
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
359
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
360
+ subprocess environment manager will be used.
361
+ """
362
+
363
+ if debug:
364
+ logging.getLogger().setLevel(logging.DEBUG)
365
+ if n_episode is not None:
366
+ raise NotImplementedError
367
+ # define env and policy
368
+ env_num = env_num if env_num else self.cfg.env.collector_env_num
369
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
370
+
371
+ if save_data_path is None:
372
+ save_data_path = os.path.join(self.exp_name, 'demo_data')
373
+
374
+ # main execution task
375
+ with task.start(ctx=OnlineRLContext()):
376
+ task.use(
377
+ StepCollector(
378
+ self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
379
+ )
380
+ )
381
+ task.use(offline_data_saver(save_data_path, data_type='hdf5'))
382
+ task.run(max_step=1)
383
+ logging.info(
384
+ f'DQN collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
385
+ )
386
+
387
+ def batch_evaluate(
388
+ self,
389
+ env_num: int = 4,
390
+ n_evaluator_episode: int = 4,
391
+ context: Optional[str] = None,
392
+ debug: bool = False
393
+ ) -> EvalReturn:
394
+ """
395
+ Overview:
396
+ Evaluate the agent with DQN algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \
397
+ environments. The evaluation result will be returned.
398
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
399
+ multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \
400
+ will only create one evaluator environment to evaluate the agent and save the replay video.
401
+ Arguments:
402
+ - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
403
+ - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
404
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
405
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
406
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
407
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
408
+ subprocess environment manager will be used.
409
+ Returns:
410
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
411
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
412
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
413
+ """
414
+
415
+ if debug:
416
+ logging.getLogger().setLevel(logging.DEBUG)
417
+ # define env and policy
418
+ env_num = env_num if env_num else self.cfg.env.evaluator_env_num
419
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
420
+
421
+ # reset first to make sure the env is in the initial state
422
+ # env will be reset again in the main loop
423
+ env.launch()
424
+ env.reset()
425
+
426
+ evaluate_cfg = self.cfg
427
+ evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
428
+
429
+ # main execution task
430
+ with task.start(ctx=OnlineRLContext()):
431
+ task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
432
+ task.run(max_step=1)
433
+
434
+ return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
435
+
436
+ @property
437
+ def best(self) -> 'DQNAgent':
438
+ """
439
+ Overview:
440
+ Load the best model from the checkpoint directory, \
441
+ which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \
442
+ The return value is the agent with the best model.
443
+ Returns:
444
+ - (:obj:`DQNAgent`): The agent with the best model.
445
+ Examples:
446
+ >>> agent = DQNAgent(env_id='LunarLander-v2')
447
+ >>> agent.train()
448
+ >>> agent = agent.best
449
+
450
+ .. note::
451
+ The best model is the model with the highest evaluation return. If this method is called, the current \
452
+ model will be replaced by the best model.
453
+ """
454
+
455
+ best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
456
+ # Load best model if it exists
457
+ if os.path.exists(best_model_file_path):
458
+ policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
459
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
460
+ return self
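The deploy method above turns the raw Q-network into a single-env greedy policy via ``model_wrap``. A self-contained sketch of that wrapper pattern, using an ad-hoc (untrained) model with LunarLander-sized shapes rather than a loaded checkpoint:

import torch
import treetensor.torch as ttorch
from ding.model import DQN, model_wrap

# Wrap the Q-network so forward() returns greedy (argmax) actions.
model = DQN(obs_shape=8, action_shape=4)
greedy_fn = model_wrap(model, wrapper_name='argmax_sample').forward

obs = ttorch.as_tensor(torch.zeros(8)).unsqueeze(0)                  # (O,) -> (1, O): add batch dim
action = greedy_fn(obs)['action'].squeeze(0).detach().cpu().numpy()  # (1,) -> (): drop batch dim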
DI-engine/ding/bonus/model.py ADDED
@@ -0,0 +1,245 @@
1
+ from typing import Union, Optional
2
+ from easydict import EasyDict
3
+ import torch
4
+ import torch.nn as nn
5
+ import treetensor.torch as ttorch
6
+ from copy import deepcopy
7
+ from ding.utils import SequenceType, squeeze
8
+ from ding.model.common import ReparameterizationHead, RegressionHead, MultiHead, \
9
+ FCEncoder, ConvEncoder, IMPALAConvEncoder, PopArtVHead
10
+ from ding.torch_utils import MLP, fc_block
11
+
12
+
13
+ class DiscretePolicyHead(nn.Module):
14
+
15
+ def __init__(
16
+ self,
17
+ hidden_size: int,
18
+ output_size: int,
19
+ layer_num: int = 1,
20
+ activation: Optional[nn.Module] = nn.ReLU(),
21
+ norm_type: Optional[str] = None,
22
+ ) -> None:
23
+ super(DiscretePolicyHead, self).__init__()
24
+ self.main = nn.Sequential(
25
+ MLP(
26
+ hidden_size,
27
+ hidden_size,
28
+ hidden_size,
29
+ layer_num,
30
+ layer_fn=nn.Linear,
31
+ activation=activation,
32
+ norm_type=norm_type
33
+ ), fc_block(hidden_size, output_size)
34
+ )
35
+
36
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
37
+ return self.main(x)
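A quick shape check for the head defined above (the sizes are illustrative):

import torch

head = DiscretePolicyHead(hidden_size=64, output_size=4, layer_num=2)
logits = head(torch.randn(8, 64))
assert logits.shape == (8, 4)  # one logit per action for each batch element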
38
+
39
+
40
+ class PPOFModel(nn.Module):
41
+ mode = ['compute_actor', 'compute_critic', 'compute_actor_critic']
42
+
43
+ def __init__(
44
+ self,
45
+ obs_shape: Union[int, SequenceType],
46
+ action_shape: Union[int, SequenceType, EasyDict],
47
+ action_space: str = 'discrete',
48
+ share_encoder: bool = True,
49
+ encoder_hidden_size_list: SequenceType = [128, 128, 64],
50
+ actor_head_hidden_size: int = 64,
51
+ actor_head_layer_num: int = 1,
52
+ critic_head_hidden_size: int = 64,
53
+ critic_head_layer_num: int = 1,
54
+ activation: Optional[nn.Module] = nn.ReLU(),
55
+ norm_type: Optional[str] = None,
56
+ sigma_type: Optional[str] = 'independent',
57
+ fixed_sigma_value: Optional[int] = 0.3,
58
+ bound_type: Optional[str] = None,
59
+ encoder: Optional[torch.nn.Module] = None,
60
+ popart_head=False,
61
+ ) -> None:
62
+ super(PPOFModel, self).__init__()
63
+ obs_shape = squeeze(obs_shape)
64
+ action_shape = squeeze(action_shape)
65
+ self.obs_shape, self.action_shape = obs_shape, action_shape
66
+ self.share_encoder = share_encoder
67
+
68
+ # Encoder Type
69
+ def new_encoder(outsize):
70
+ if isinstance(obs_shape, int) or len(obs_shape) == 1:
71
+ return FCEncoder(
72
+ obs_shape=obs_shape,
73
+ hidden_size_list=encoder_hidden_size_list,
74
+ activation=activation,
75
+ norm_type=norm_type
76
+ )
77
+ elif len(obs_shape) == 3:
78
+ return ConvEncoder(
79
+ obs_shape=obs_shape,
80
+ hidden_size_list=encoder_hidden_size_list,
81
+ activation=activation,
82
+ norm_type=norm_type
83
+ )
84
+ else:
85
+ raise RuntimeError(
86
+ "not support obs_shape for pre-defined encoder: {}, please customize your own encoder".
87
+ format(obs_shape)
88
+ )
89
+
90
+ if self.share_encoder:
91
+ assert actor_head_hidden_size == critic_head_hidden_size, \
92
+ "actor and critic network head should have same size."
93
+ if encoder:
94
+ if isinstance(encoder, torch.nn.Module):
95
+ self.encoder = encoder
96
+ else:
97
+ raise ValueError("illegal encoder instance.")
98
+ else:
99
+ self.encoder = new_encoder(actor_head_hidden_size)
100
+ else:
101
+ if encoder:
102
+ if isinstance(encoder, torch.nn.Module):
103
+ self.actor_encoder = encoder
104
+ self.critic_encoder = deepcopy(encoder)
105
+ else:
106
+ raise ValueError("illegal encoder instance.")
107
+ else:
108
+ self.actor_encoder = new_encoder(actor_head_hidden_size)
109
+ self.critic_encoder = new_encoder(critic_head_hidden_size)
110
+
111
+ # Head Type
112
+ if not popart_head:
113
+ self.critic_head = RegressionHead(
114
+ critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type
115
+ )
116
+ else:
117
+ self.critic_head = PopArtVHead(
118
+ critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type
119
+ )
120
+
121
+ self.action_space = action_space
122
+ assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space
123
+ if self.action_space == 'continuous':
124
+ self.multi_head = False
125
+ self.actor_head = ReparameterizationHead(
126
+ actor_head_hidden_size,
127
+ action_shape,
128
+ actor_head_layer_num,
129
+ sigma_type=sigma_type,
130
+ activation=activation,
131
+ norm_type=norm_type,
132
+ bound_type=bound_type
133
+ )
134
+ elif self.action_space == 'discrete':
135
+ actor_head_cls = DiscretePolicyHead
136
+ multi_head = not isinstance(action_shape, int)
137
+ self.multi_head = multi_head
138
+ if multi_head:
139
+ self.actor_head = MultiHead(
140
+ actor_head_cls,
141
+ actor_head_hidden_size,
142
+ action_shape,
143
+ layer_num=actor_head_layer_num,
144
+ activation=activation,
145
+ norm_type=norm_type
146
+ )
147
+ else:
148
+ self.actor_head = actor_head_cls(
149
+ actor_head_hidden_size,
150
+ action_shape,
151
+ actor_head_layer_num,
152
+ activation=activation,
153
+ norm_type=norm_type
154
+ )
155
+ elif self.action_space == 'hybrid': # HPPO
156
+ # hybrid action space: action_type(discrete) + action_args(continuous),
157
+ # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])}
158
+ action_shape.action_args_shape = squeeze(action_shape.action_args_shape)
159
+ action_shape.action_type_shape = squeeze(action_shape.action_type_shape)
160
+ actor_action_args = ReparameterizationHead(
161
+ actor_head_hidden_size,
162
+ action_shape.action_args_shape,
163
+ actor_head_layer_num,
164
+ sigma_type=sigma_type,
165
+ fixed_sigma_value=fixed_sigma_value,
166
+ activation=activation,
167
+ norm_type=norm_type,
168
+ bound_type=bound_type,
169
+ )
170
+ actor_action_type = DiscretePolicyHead(
171
+ actor_head_hidden_size,
172
+ action_shape.action_type_shape,
173
+ actor_head_layer_num,
174
+ activation=activation,
175
+ norm_type=norm_type,
176
+ )
177
+ self.actor_head = nn.ModuleList([actor_action_type, actor_action_args])
178
+
179
+ # must use list, not nn.ModuleList
180
+ if self.share_encoder:
181
+ self.actor = [self.encoder, self.actor_head]
182
+ self.critic = [self.encoder, self.critic_head]
183
+ else:
184
+ self.actor = [self.actor_encoder, self.actor_head]
185
+ self.critic = [self.critic_encoder, self.critic_head]
186
+ # Convenient for calling some apis (e.g. self.critic.parameters()),
187
+ # but may cause misunderstanding when `print(self)`
188
+ self.actor = nn.ModuleList(self.actor)
189
+ self.critic = nn.ModuleList(self.critic)
190
+
191
+ def forward(self, inputs: ttorch.Tensor, mode: str) -> ttorch.Tensor:
192
+ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode)
193
+ return getattr(self, mode)(inputs)
194
+
195
+ def compute_actor(self, x: ttorch.Tensor) -> ttorch.Tensor:
196
+ if self.share_encoder:
197
+ x = self.encoder(x)
198
+ else:
199
+ x = self.actor_encoder(x)
200
+
201
+ if self.action_space == 'discrete':
202
+ return self.actor_head(x)
203
+ elif self.action_space == 'continuous':
204
+ x = self.actor_head(x) # mu, sigma
205
+ return ttorch.as_tensor(x)
206
+ elif self.action_space == 'hybrid':
207
+ action_type = self.actor_head[0](x)
208
+ action_args = self.actor_head[1](x)
209
+ return ttorch.as_tensor({'action_type': action_type, 'action_args': action_args})
210
+
211
+ def compute_critic(self, x: ttorch.Tensor) -> ttorch.Tensor:
212
+ if self.share_encoder:
213
+ x = self.encoder(x)
214
+ else:
215
+ x = self.critic_encoder(x)
216
+ x = self.critic_head(x)
217
+ return x
218
+
219
+ def compute_actor_critic(self, x: ttorch.Tensor) -> ttorch.Tensor:
220
+ if self.share_encoder:
221
+ actor_embedding = critic_embedding = self.encoder(x)
222
+ else:
223
+ actor_embedding = self.actor_encoder(x)
224
+ critic_embedding = self.critic_encoder(x)
225
+
226
+ value = self.critic_head(critic_embedding)
227
+
228
+ if self.action_space == 'discrete':
229
+ logit = self.actor_head(actor_embedding)
230
+ return ttorch.as_tensor({'logit': logit, 'value': value['pred']})
231
+ elif self.action_space == 'continuous':
232
+ x = self.actor_head(actor_embedding)
233
+ return ttorch.as_tensor({'logit': x, 'value': value['pred']})
234
+ elif self.action_space == 'hybrid':
235
+ action_type = self.actor_head[0](actor_embedding)
236
+ action_args = self.actor_head[1](actor_embedding)
237
+ return ttorch.as_tensor(
238
+ {
239
+ 'logit': {
240
+ 'action_type': action_type,
241
+ 'action_args': action_args
242
+ },
243
+ 'value': value['pred']
244
+ }
245
+ )
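A shape-level sketch of the forward modes above for a discrete action space (sizes are illustrative):

import torch
from ding.bonus.model import PPOFModel

model = PPOFModel(obs_shape=8, action_shape=4, action_space='discrete')
x = torch.randn(2, 8)
logit = model(x, mode='compute_actor')       # per-action logits, shape (2, 4)
out = model(x, mode='compute_actor_critic')  # treetensor with .logit and .value (the critic head's 'pred')
print(out.logit.shape, out.value.shape)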
DI-engine/ding/bonus/pg.py ADDED
@@ -0,0 +1,453 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ import treetensor.torch as ttorch
8
+ from ding.framework import task, OnlineRLContext
9
+ from ding.framework.middleware import CkptSaver, trainer, \
10
+ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \
11
+ montecarlo_return_estimator, final_ctx_saver, EpisodeCollector
12
+ from ding.envs import BaseEnv
13
+ from ding.envs import setup_ding_env_manager
14
+ from ding.policy import PGPolicy
15
+ from ding.utils import set_pkg_seed
16
+ from ding.utils import get_env_fps, render
17
+ from ding.config import save_config_py, compile_config
18
+ from ding.model import PG
19
+ from ding.bonus.common import TrainingReturn, EvalReturn
20
+ from ding.config.example.PG import supported_env_cfg
21
+ from ding.config.example.PG import supported_env
22
+
23
+
24
+ class PGAgent:
25
+ """
26
+ Overview:
27
+ Class of agent for training, evaluation and deployment of the Reinforcement Learning algorithm Policy Gradient (PG).
28
+ For more information about the system design of RL agent, please refer to \
29
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
30
+ Interface:
31
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
32
+ """
33
+ supported_env_list = list(supported_env_cfg.keys())
34
+ """
35
+ Overview:
36
+ List of supported envs.
37
+ Examples:
38
+ >>> from ding.bonus.pg import PGAgent
39
+ >>> print(PGAgent.supported_env_list)
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ env_id: str = None,
45
+ env: BaseEnv = None,
46
+ seed: int = 0,
47
+ exp_name: str = None,
48
+ model: Optional[torch.nn.Module] = None,
49
+ cfg: Optional[Union[EasyDict, dict]] = None,
50
+ policy_state_dict: str = None,
51
+ ) -> None:
52
+ """
53
+ Overview:
54
+ Initialize agent for PG algorithm.
55
+ Arguments:
56
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
57
+ If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
58
+ If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
59
+ ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
60
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
61
+ If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
62
+ ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
63
+ If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
64
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
65
+ Default to 0.
66
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
67
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
68
+ - model (:obj:`torch.nn.Module`): The model of PG algorithm, which should be an instance of class \
69
+ :class:`ding.model.PG`. \
70
+ If not specified, a default model will be generated according to the configuration.
71
+ - cfg (:obj:`Union[EasyDict, dict]`): The configuration of PG algorithm, which is a dict. \
72
+ Default to None. If not specified, the default configuration will be used. \
73
+ The default configuration can be found in ``ding/config/example/PG/gym_lunarlander_v2.py``.
74
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
75
+ If specified, the policy will be loaded from this file. Default to None.
76
+
77
+ .. note::
78
+ An RL Agent Instance can be initialized in two basic ways. \
79
+ For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
80
+ and we want to train an agent with PG algorithm with default configuration. \
81
+ Then we can initialize the agent in the following ways:
82
+ >>> agent = PGAgent(env_id='LunarLanderContinuous-v2')
83
+ or, if we want to specify the env_id in the configuration:
84
+ >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
85
+ >>> agent = PGAgent(cfg=cfg)
86
+ There are also other arguments to specify the agent when initializing.
87
+ For example, if we want to specify the environment instance:
88
+ >>> env = CustomizedEnv('LunarLanderContinuous-v2')
89
+ >>> agent = PGAgent(cfg=cfg, env=env)
90
+ or, if we want to specify the model:
91
+ >>> model = PG(**cfg.policy.model)
92
+ >>> agent = PGAgent(cfg=cfg, model=model)
93
+ or, if we want to reload the policy from a saved policy state dict:
94
+ >>> agent = PGAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
95
+ Make sure that the configuration is consistent with the saved policy state dict.
96
+ """
97
+
98
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
99
+
100
+ if cfg is not None and not isinstance(cfg, EasyDict):
101
+ cfg = EasyDict(cfg)
102
+
103
+ if env_id is not None:
104
+ assert env_id in PGAgent.supported_env_list, "Please use supported envs: {}".format(
105
+ PGAgent.supported_env_list
106
+ )
107
+ if cfg is None:
108
+ cfg = supported_env_cfg[env_id]
109
+ else:
110
+ assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
111
+ else:
112
+ assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
113
+ assert cfg.env.env_id in PGAgent.supported_env_list, "Please use supported envs: {}".format(
114
+ PGAgent.supported_env_list
115
+ )
116
+ default_policy_config = EasyDict({"policy": PGPolicy.default_config()})
117
+ default_policy_config.update(cfg)
118
+ cfg = default_policy_config
119
+
120
+ if exp_name is not None:
121
+ cfg.exp_name = exp_name
122
+ self.cfg = compile_config(cfg, policy=PGPolicy)
123
+ self.exp_name = self.cfg.exp_name
124
+ if env is None:
125
+ self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
126
+ else:
127
+ assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
128
+ self.env = env
129
+
130
+ logging.getLogger().setLevel(logging.INFO)
131
+ self.seed = seed
132
+ set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
133
+ if not os.path.exists(self.exp_name):
134
+ os.makedirs(self.exp_name)
135
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
136
+ if model is None:
137
+ model = PG(**self.cfg.policy.model)
138
+ self.policy = PGPolicy(self.cfg.policy, model=model)
139
+ if policy_state_dict is not None:
140
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
141
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
142
+
143
+ def train(
144
+ self,
145
+ step: int = int(1e7),
146
+ collector_env_num: int = None,
147
+ evaluator_env_num: int = None,
148
+ n_iter_save_ckpt: int = 1000,
149
+ context: Optional[str] = None,
150
+ debug: bool = False,
151
+ wandb_sweep: bool = False,
152
+ ) -> TrainingReturn:
153
+ """
154
+ Overview:
155
+ Train the agent with PG algorithm for ``step`` environment steps with ``collector_env_num`` collector \
156
+ environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
157
+ recorded and saved by wandb.
158
+ Arguments:
159
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
160
+ - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
161
+ If not specified, it will be set according to the configuration.
162
+ - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
163
+ If not specified, it will be set according to the configuration.
164
+ - n_iter_save_ckpt (:obj:`int`): The frequency, in training iterations, at which checkpoints are saved. \
165
+ Default to 1000.
166
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
167
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
168
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
169
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
170
+ subprocess environment manager will be used.
171
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
172
+ which is a hyper-parameter optimization process for seeking the best configurations. \
173
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
174
+ Returns:
175
+ - (:obj:`TrainingReturn`): The training result, of which the attributes are:
176
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project URL of the training experiment.
177
+ """
178
+
179
+ if debug:
180
+ logging.getLogger().setLevel(logging.DEBUG)
181
+ logging.debug(self.policy._model)
182
+ # define env and policy
183
+ collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
184
+ evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
185
+ collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
186
+ evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
187
+
188
+ with task.start(ctx=OnlineRLContext()):
189
+ task.use(
190
+ interaction_evaluator(
191
+ self.cfg,
192
+ self.policy.eval_mode,
193
+ evaluator_env,
194
+ render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
195
+ )
196
+ )
197
+ task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
198
+ task.use(EpisodeCollector(self.cfg, self.policy.collect_mode, collector_env))
199
+ task.use(montecarlo_return_estimator(self.policy))
200
+ task.use(trainer(self.cfg, self.policy.learn_mode))
201
+ task.use(
202
+ wandb_online_logger(
203
+ metric_list=self.policy._monitor_vars_learn(),
204
+ model=self.policy._model,
205
+ anonymous=True,
206
+ project_name=self.exp_name,
207
+ wandb_sweep=wandb_sweep,
208
+ )
209
+ )
210
+ task.use(termination_checker(max_env_step=step))
211
+ task.use(final_ctx_saver(name=self.exp_name))
212
+ task.run()
213
+
214
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
215
+
216
+ def deploy(
217
+ self,
218
+ enable_save_replay: bool = False,
219
+ concatenate_all_replay: bool = False,
220
+ replay_save_path: str = None,
221
+ seed: Optional[Union[int, List]] = None,
222
+ debug: bool = False
223
+ ) -> EvalReturn:
224
+ """
225
+ Overview:
226
+ Deploy the agent with PG algorithm by interacting with the environment, during which the replay video \
227
+ can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
228
+ Arguments:
229
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
230
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
231
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
232
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
233
+ the replay video of each episode will be saved separately.
234
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
235
+ If not specified, the video will be saved in ``exp_name/videos``.
236
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
237
+ Default to None. If not specified, ``self.seed`` will be used. \
238
+ If ``seed`` is an integer, the agent will be deployed once. \
239
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
240
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
241
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
242
+ subprocess environment manager will be used.
243
+ Returns:
244
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
245
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
246
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
247
+ """
248
+
249
+ if debug:
250
+ logging.getLogger().setLevel(logging.DEBUG)
251
+ # define env and policy
252
+ env = self.env.clone(caller='evaluator')
253
+
254
+ if seed is not None and isinstance(seed, int):
255
+ seeds = [seed]
256
+ elif seed is not None and isinstance(seed, list):
257
+ seeds = seed
258
+ else:
259
+ seeds = [self.seed]
260
+
261
+ returns = []
262
+ images = []
263
+ if enable_save_replay:
264
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
265
+ env.enable_save_replay(replay_path=replay_save_path)
266
+ else:
267
+ logging.warning('No video would be generated during the deploy.')
268
+ if concatenate_all_replay:
269
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
270
+ concatenate_all_replay = False
271
+
272
+ def single_env_forward_wrapper(forward_fn, cuda=True):
273
+
274
+ def _forward(obs):
275
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
276
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
277
+ if cuda and torch.cuda.is_available():
278
+ obs = obs.cuda()
279
+ output = forward_fn(obs)
280
+ if self.policy._cfg.deterministic_eval:
281
+ if self.policy._cfg.action_space == 'discrete':
282
+ output['action'] = output['logit'].argmax(dim=-1)
283
+ elif self.policy._cfg.action_space == 'continuous':
284
+ output['action'] = output['logit']['mu']
285
+ else:
286
+ raise KeyError("invalid action_space: {}".format(self.policy._cfg.action_space))
287
+ else:
288
+ output['action'] = output['dist'].sample()
289
+ # squeeze means delete batch dim, i.e. (1, A) -> (A, )
290
+ action = output['action'].squeeze(0).detach().cpu().numpy()
291
+ return action
292
+
293
+ return _forward
294
+
295
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
296
+
297
+ # reset first to make sure the env is in the initial state
298
+ # env will be reset again in the main loop
299
+ env.reset()
300
+
301
+ for seed in seeds:
302
+ env.seed(seed, dynamic_seed=False)
303
+ return_ = 0.
304
+ step = 0
305
+ obs = env.reset()
306
+ images.append(render(env)[None]) if concatenate_all_replay else None
307
+ while True:
308
+ action = forward_fn(obs)
309
+ obs, rew, done, info = env.step(action)
310
+ images.append(render(env)[None]) if concatenate_all_replay else None
311
+ return_ += rew
312
+ step += 1
313
+ if done:
314
+ break
315
+ logging.info(f'PG deploy is finished, final episode return with {step} steps is: {return_}')
316
+ returns.append(return_)
317
+
318
+ env.close()
319
+
320
+ if concatenate_all_replay:
321
+ images = np.concatenate(images, axis=0)
322
+ import imageio
323
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
324
+
325
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
326
+
327
+ def collect_data(
328
+ self,
329
+ env_num: int = 8,
330
+ save_data_path: Optional[str] = None,
331
+ n_sample: Optional[int] = None,
332
+ n_episode: Optional[int] = None,
333
+ context: Optional[str] = None,
334
+ debug: bool = False
335
+ ) -> None:
336
+ """
337
+ Overview:
338
+ Collect ``n_sample`` samples with the PG algorithm using ``env_num`` collector environments. \
339
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
340
+ ``exp_name/demo_data``.
341
+ Arguments:
342
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
343
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
344
+ If not specified, the data will be saved in ``exp_name/demo_data``.
345
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
346
+ If not specified, ``n_episode`` must be specified.
347
+ - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
348
+ Episode-based collection is not implemented yet; ``n_sample`` must be specified.
349
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
350
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
351
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
352
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
353
+ subprocess environment manager will be used.
354
+ """
355
+
356
+ if debug:
357
+ logging.getLogger().setLevel(logging.DEBUG)
358
+ if n_episode is not None:
359
+ raise NotImplementedError
360
+ # define env and policy
361
+ env_num = env_num if env_num else self.cfg.env.collector_env_num
362
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
363
+
364
+ if save_data_path is None:
365
+ save_data_path = os.path.join(self.exp_name, 'demo_data')
366
+
367
+ # main execution task
368
+ with task.start(ctx=OnlineRLContext()):
369
+ task.use(
370
+ StepCollector(
371
+ self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
372
+ )
373
+ )
374
+ task.use(offline_data_saver(save_data_path, data_type='hdf5'))
375
+ task.run(max_step=1)
376
+ logging.info(
377
+ f'PG collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
378
+ )
379
+
380
+ def batch_evaluate(
381
+ self,
382
+ env_num: int = 4,
383
+ n_evaluator_episode: int = 4,
384
+ context: Optional[str] = None,
385
+ debug: bool = False
386
+ ) -> EvalReturn:
387
+ """
388
+ Overview:
389
+ Evaluate the agent with PG algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \
390
+ environments. The evaluation result will be returned.
391
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
392
+ multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \
393
+ will only create one evaluator environment to evaluate the agent and save the replay video.
394
+ Arguments:
395
+ - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
396
+ - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
397
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
398
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
399
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
400
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
401
+ subprocess environment manager will be used.
402
+ Returns:
403
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
404
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
405
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
406
+ """
407
+
408
+ if debug:
409
+ logging.getLogger().setLevel(logging.DEBUG)
410
+ # define env and policy
411
+ env_num = env_num if env_num else self.cfg.env.evaluator_env_num
412
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
413
+
414
+ # reset first to make sure the env is in the initial state
415
+ # env will be reset again in the main loop
416
+ env.launch()
417
+ env.reset()
418
+
419
+ evaluate_cfg = self.cfg
420
+ evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
421
+
422
+ # main execution task
423
+ with task.start(ctx=OnlineRLContext()):
424
+ task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
425
+ task.run(max_step=1)
426
+
427
+ return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
428
+
429
+ @property
430
+ def best(self) -> 'PGAgent':
431
+ """
432
+ Overview:
433
+ Load the best model from the checkpoint directory, \
434
+ which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \
435
+ The return value is the agent with the best model.
436
+ Returns:
437
+ - (:obj:`PGAgent`): The agent with the best model.
438
+ Examples:
439
+ >>> agent = PGAgent(env_id='LunarLanderContinuous-v2')
440
+ >>> agent.train()
441
+ >>> agent = agent.best
442
+
443
+ .. note::
444
+ The best model is the model with the highest evaluation return. If this method is called, the current \
445
+ model will be replaced by the best model.
446
+ """
447
+
448
+ best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
449
+ # Load best model if it exists
450
+ if os.path.exists(best_model_file_path):
451
+ policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
452
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
453
+ return self
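Unlike the replay-buffer pipelines in the other agents, the PG training task above wires whole-episode collection straight into a Monte Carlo return estimate and an on-policy trainer. The middleware chain in isolation, assuming ``cfg``, ``policy`` and ``collector_env`` are already constructed exactly as in ``PGAgent.train()``:

with task.start(ctx=OnlineRLContext()):
    task.use(EpisodeCollector(cfg, policy.collect_mode, collector_env))  # roll out complete episodes
    task.use(montecarlo_return_estimator(policy))                        # label each step with its return-to-go
    task.use(trainer(cfg, policy.learn_mode))                            # one on-policy update per cycle
    task.run(max_step=1)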
DI-engine/ding/bonus/ppo_offpolicy.py ADDED
@@ -0,0 +1,471 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ import treetensor.torch as ttorch
8
+ from ding.framework import task, OnlineRLContext
9
+ from ding.framework.middleware import CkptSaver, final_ctx_saver, OffPolicyLearner, StepCollector, \
10
+ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, gae_estimator
11
+ from ding.envs import BaseEnv
12
+ from ding.envs import setup_ding_env_manager
13
+ from ding.policy import PPOOffPolicy
14
+ from ding.utils import set_pkg_seed
15
+ from ding.utils import get_env_fps, render
16
+ from ding.config import save_config_py, compile_config
17
+ from ding.model import VAC
18
+ from ding.model import model_wrap
19
+ from ding.data import DequeBuffer
20
+ from ding.bonus.common import TrainingReturn, EvalReturn
21
+ from ding.config.example.PPOOffPolicy import supported_env_cfg
22
+ from ding.config.example.PPOOffPolicy import supported_env
23
+
24
+
25
+ class PPOOffPolicyAgent:
26
+ """
27
+ Overview:
28
+ Class of agent for training, evaluation and deployment of the reinforcement learning algorithm \
29
+ Proximal Policy Optimization (PPO) in an off-policy style.
30
+ For more information about the system design of RL agent, please refer to \
31
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
32
+ Interface:
33
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
34
+ """
35
+ supported_env_list = list(supported_env_cfg.keys())
36
+ """
37
+ Overview:
38
+ List of supported envs.
39
+ Examples:
40
+ >>> from ding.bonus.ppo_offpolicy import PPOOffPolicyAgent
41
+ >>> print(PPOOffPolicyAgent.supported_env_list)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ env_id: str = None,
47
+ env: BaseEnv = None,
48
+ seed: int = 0,
49
+ exp_name: str = None,
50
+ model: Optional[torch.nn.Module] = None,
51
+ cfg: Optional[Union[EasyDict, dict]] = None,
52
+ policy_state_dict: str = None,
53
+ ) -> None:
54
+ """
55
+ Overview:
56
+ Initialize agent for PPO (offpolicy) algorithm.
57
+ Arguments:
58
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
59
+ If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
60
+ If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
61
+ ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
62
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
63
+ If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
64
+ ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
65
+ If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
66
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
67
+ Default to 0.
68
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
69
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
70
+ - model (:obj:`torch.nn.Module`): The model of PPO (offpolicy) algorithm, \
71
+ which should be an instance of class :class:`ding.model.VAC`. \
72
+ If not specified, a default model will be generated according to the configuration.
73
+ - cfg (:obj:`Union[EasyDict, dict]`): The configuration of PPO (offpolicy) algorithm, which is a dict. \
74
+ Default to None. If not specified, the default configuration will be used. \
75
+ The default configuration can be found in ``ding/config/example/PPOOffPolicy/gym_lunarlander_v2.py``.
76
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
77
+ If specified, the policy will be loaded from this file. Default to None.
78
+
79
+ .. note::
80
+ An RL Agent Instance can be initialized in two basic ways. \
81
+ For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
82
+ and we want to train an agent with PPO (offpolicy) algorithm with default configuration. \
83
+ Then we can initialize the agent in the following ways:
84
+ >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2')
85
+ or, if we want to specify the env_id in the configuration:
86
+ >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
87
+ >>> agent = PPOOffPolicyAgent(cfg=cfg)
88
+ There are also other arguments to specify the agent when initializing.
89
+ For example, if we want to specify the environment instance:
90
+ >>> env = CustomizedEnv('LunarLander-v2')
91
+ >>> agent = PPOOffPolicyAgent(cfg=cfg, env=env)
92
+ or, if we want to specify the model:
93
+ >>> model = VAC(**cfg.policy.model)
94
+ >>> agent = PPOOffPolicyAgent(cfg=cfg, model=model)
95
+ or, if we want to reload the policy from a saved policy state dict:
96
+ >>> agent = PPOOffPolicyAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
97
+ Make sure that the configuration is consistent with the saved policy state dict.
98
+ """
99
+
100
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
101
+
102
+ if cfg is not None and not isinstance(cfg, EasyDict):
103
+ cfg = EasyDict(cfg)
104
+
105
+ if env_id is not None:
106
+ assert env_id in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format(
107
+ PPOOffPolicyAgent.supported_env_list
108
+ )
109
+ if cfg is None:
110
+ cfg = supported_env_cfg[env_id]
111
+ else:
112
+ assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
113
+ else:
114
+ assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
115
+ assert cfg.env.env_id in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format(
116
+ PPOOffPolicyAgent.supported_env_list
117
+ )
118
+ default_policy_config = EasyDict({"policy": PPOOffPolicy.default_config()})
119
+ default_policy_config.update(cfg)
120
+ cfg = default_policy_config
121
+
122
+ if exp_name is not None:
123
+ cfg.exp_name = exp_name
124
+ self.cfg = compile_config(cfg, policy=PPOOffPolicy)
125
+ self.exp_name = self.cfg.exp_name
126
+ if env is None:
127
+ self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
128
+ else:
129
+ assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
130
+ self.env = env
131
+
132
+ logging.getLogger().setLevel(logging.INFO)
133
+ self.seed = seed
134
+ set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
135
+ if not os.path.exists(self.exp_name):
136
+ os.makedirs(self.exp_name)
137
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
138
+ if model is None:
139
+ model = VAC(**self.cfg.policy.model)
140
+ self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
141
+ self.policy = PPOOffPolicy(self.cfg.policy, model=model)
142
+ if policy_state_dict is not None:
143
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
144
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
145
+
146
+ def train(
147
+ self,
148
+ step: int = int(1e7),
149
+ collector_env_num: int = None,
150
+ evaluator_env_num: int = None,
151
+ n_iter_save_ckpt: int = 1000,
152
+ context: Optional[str] = None,
153
+ debug: bool = False,
154
+ wandb_sweep: bool = False,
155
+ ) -> TrainingReturn:
156
+ """
157
+ Overview:
158
+ Train the agent with PPO (offpolicy) algorithm for ``step`` iterations with ``collector_env_num`` \
159
+ collector environments and ``evaluator_env_num`` evaluator environments. \
160
+ Information during training will be recorded and saved by wandb.
161
+ Arguments:
162
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
163
+ - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
164
+ If not specified, it will be set according to the configuration.
165
+ - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
166
+ If not specified, it will be set according to the configuration.
167
+ - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
168
+ Default to 1000.
169
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
170
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
171
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
172
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
173
+ subprocess environment manager will be used.
174
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
175
+ which is a hyper-parameter optimization process for seeking the best configurations. \
176
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
177
+ Returns:
178
+ - (:obj:`TrainingReturn`): The training result, of which the attributes are:
179
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
180
+ """
181
+
182
+ if debug:
183
+ logging.getLogger().setLevel(logging.DEBUG)
184
+ logging.debug(self.policy._model)
185
+ # define env and policy
186
+ collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
187
+ evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
188
+ collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
189
+ evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
190
+
191
+ with task.start(ctx=OnlineRLContext()):
192
+ task.use(
193
+ interaction_evaluator(
194
+ self.cfg,
195
+ self.policy.eval_mode,
196
+ evaluator_env,
197
+ render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
198
+ )
199
+ )
200
+ task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
201
+ task.use(
202
+ StepCollector(
203
+ self.cfg,
204
+ self.policy.collect_mode,
205
+ collector_env,
206
+ random_collect_size=self.cfg.policy.random_collect_size
207
+ if hasattr(self.cfg.policy, 'random_collect_size') else 0,
208
+ )
209
+ )
210
+ task.use(gae_estimator(self.cfg, self.policy.collect_mode, self.buffer_))
211
+ task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
212
+ task.use(
213
+ wandb_online_logger(
214
+ cfg=self.cfg.wandb_logger,
215
+ exp_config=self.cfg,
216
+ metric_list=self.policy._monitor_vars_learn(),
217
+ model=self.policy._model,
218
+ anonymous=True,
219
+ project_name=self.exp_name,
220
+ wandb_sweep=wandb_sweep,
221
+ )
222
+ )
223
+ task.use(termination_checker(max_env_step=step))
224
+ task.use(final_ctx_saver(name=self.exp_name))
225
+ task.run()
226
+
227
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
228
+
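+ # Usage sketch (illustrative only): train with explicit env counts; the returned
+ # TrainingReturn carries the wandb run url.
+ #     agent = PPOOffPolicyAgent(env_id='LunarLander-v2')
+ #     result = agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4)
+ #     print(result.wandb_url)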
229
+ def deploy(
230
+ self,
231
+ enable_save_replay: bool = False,
232
+ concatenate_all_replay: bool = False,
233
+ replay_save_path: str = None,
234
+ seed: Optional[Union[int, List]] = None,
235
+ debug: bool = False
236
+ ) -> EvalReturn:
237
+ """
238
+ Overview:
239
+ Deploy the agent with PPO (offpolicy) algorithm by interacting with the environment, \
240
+ during which the replay video can be saved if ``enable_save_replay`` is True. \
241
+ The evaluation result will be returned.
242
+ Arguments:
243
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
244
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
245
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
246
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
247
+ the replay video of each episode will be saved separately.
248
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
249
+ If not specified, the video will be saved in ``exp_name/videos``.
250
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
251
+ Default to None. If not specified, ``self.seed`` will be used. \
252
+ If ``seed`` is an integer, the agent will be deployed once. \
253
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
254
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
255
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
256
+ subprocess environment manager will be used.
257
+ Returns:
258
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
259
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
260
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
261
+ """
262
+
263
+ if debug:
264
+ logging.getLogger().setLevel(logging.DEBUG)
265
+ # define env and policy
266
+ env = self.env.clone(caller='evaluator')
267
+
268
+ if seed is not None and isinstance(seed, int):
269
+ seeds = [seed]
270
+ elif seed is not None and isinstance(seed, list):
271
+ seeds = seed
272
+ else:
273
+ seeds = [self.seed]
274
+
275
+ returns = []
276
+ images = []
277
+ if enable_save_replay:
278
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
279
+ env.enable_save_replay(replay_path=replay_save_path)
280
+ else:
281
+ logging.warning('No video would be generated during the deploy.')
282
+ if concatenate_all_replay:
283
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
284
+ concatenate_all_replay = False
285
+
286
+ def single_env_forward_wrapper(forward_fn, cuda=True):
287
+
288
+ if self.cfg.policy.action_space == 'discrete':
289
+ forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward
290
+ elif self.cfg.policy.action_space == 'continuous':
291
+ forward_fn = model_wrap(forward_fn, wrapper_name='deterministic_sample').forward
292
+ elif self.cfg.policy.action_space == 'hybrid':
293
+ forward_fn = model_wrap(forward_fn, wrapper_name='hybrid_deterministic_argmax_sample').forward
294
+ elif self.cfg.policy.action_space == 'general':
295
+ forward_fn = model_wrap(forward_fn, wrapper_name='base').forward
296
+ else:
297
+ raise NotImplementedError
298
+
299
+ def _forward(obs):
300
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
301
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
302
+ if cuda and torch.cuda.is_available():
303
+ obs = obs.cuda()
304
+ action = forward_fn(obs, mode='compute_actor')["action"]
305
+ # squeeze means delete batch dim, i.e. (1, A) -> (A, )
306
+ action = action.squeeze(0).detach().cpu().numpy()
307
+ return action
308
+
309
+ return _forward
310
+
311
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
312
+
313
+ # reset first to make sure the env is in the initial state
314
+ # env will be reset again in the main loop
315
+ env.reset()
316
+
317
+ for seed in seeds:
318
+ env.seed(seed, dynamic_seed=False)
319
+ return_ = 0.
320
+ step = 0
321
+ obs = env.reset()
322
+ if concatenate_all_replay:
+     images.append(render(env)[None])
323
+ while True:
324
+ action = forward_fn(obs)
325
+ obs, rew, done, info = env.step(action)
326
+ if concatenate_all_replay:
+     images.append(render(env)[None])
327
+ return_ += rew
328
+ step += 1
329
+ if done:
330
+ break
331
+ logging.info(f'PPO (offpolicy) deploy is finished, final episode return with {step} steps is: {return_}')
332
+ returns.append(return_)
333
+
334
+ env.close()
335
+
336
+ if concatenate_all_replay:
337
+ images = np.concatenate(images, axis=0)
338
+ import imageio
339
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
340
+
341
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
342
+
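+ # Usage sketch (illustrative only): passing a list of seeds deploys one episode per
+ # seed, and the returned EvalReturn aggregates the episode returns.
+ #     result = agent.deploy(seed=[0, 1, 2], enable_save_replay=True)
+ #     print(result.eval_value, result.eval_value_std)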
343
+ def collect_data(
344
+ self,
345
+ env_num: int = 8,
346
+ save_data_path: Optional[str] = None,
347
+ n_sample: Optional[int] = None,
348
+ n_episode: Optional[int] = None,
349
+ context: Optional[str] = None,
350
+ debug: bool = False
351
+ ) -> None:
352
+ """
353
+ Overview:
354
+ Collect data with PPO (offpolicy) algorithm for ``n_episode`` episodes \
355
+ with ``env_num`` collector environments. \
356
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
357
+ ``exp_name/demo_data``.
358
+ Arguments:
359
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
360
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
361
+ If not specified, the data will be saved in ``exp_name/demo_data``.
362
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
363
+ If not specified, ``n_episode`` must be specified.
364
+ - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
365
+ If not specified, ``n_sample`` must be specified.
366
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
367
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
368
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
369
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
370
+ subprocess environment manager will be used.
371
+ """
372
+
373
+ if debug:
374
+ logging.getLogger().setLevel(logging.DEBUG)
375
+ if n_episode is not None:
376
+ raise NotImplementedError
377
+ # define env and policy
378
+ env_num = env_num if env_num else self.cfg.env.collector_env_num
379
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
380
+
381
+ if save_data_path is None:
382
+ save_data_path = os.path.join(self.exp_name, 'demo_data')
383
+
384
+ # main execution task
385
+ with task.start(ctx=OnlineRLContext()):
386
+ task.use(
387
+ StepCollector(
388
+ self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
389
+ )
390
+ )
391
+ task.use(offline_data_saver(save_data_path, data_type='hdf5'))
392
+ task.run(max_step=1)
393
+ logging.info(
394
+ f'PPOOffPolicy collecting is finished, more than {n_sample} '
395
+ f'samples are collected and saved in `{save_data_path}`'
396
+ )
397
+
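+ # Usage sketch (illustrative only): collect 10000 samples into exp_name/demo_data as
+ # an HDF5 file; n_episode is not implemented yet, so only n_sample can be used.
+ #     agent.collect_data(env_num=8, n_sample=10000)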
398
+ def batch_evaluate(
399
+ self,
400
+ env_num: int = 4,
401
+ n_evaluator_episode: int = 4,
402
+ context: Optional[str] = None,
403
+ debug: bool = False
404
+ ) -> EvalReturn:
405
+ """
406
+ Overview:
407
+ Evaluate the agent with PPO (offpolicy) algorithm for ``n_evaluator_episode`` episodes \
408
+ with ``env_num`` evaluator environments. The evaluation result will be returned.
409
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
410
+ multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \
411
+ will only create one evaluator environment to evaluate the agent and save the replay video.
412
+ Arguments:
413
+ - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
414
+ - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
415
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
416
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
417
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
418
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
419
+ subprocess environment manager will be used.
420
+ Returns:
421
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
422
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
423
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
424
+ """
425
+
426
+ if debug:
427
+ logging.getLogger().setLevel(logging.DEBUG)
428
+ # define env and policy
429
+ env_num = env_num if env_num else self.cfg.env.evaluator_env_num
430
+ env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
431
+
432
+ # reset first to make sure the env is in the initial state
433
+ # env will be reset again in the main loop
434
+ env.launch()
435
+ env.reset()
436
+
437
+ # deep copy so that setting n_evaluator_episode does not permanently mutate self.cfg
+ evaluate_cfg = copy.deepcopy(self.cfg)
438
+ evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
439
+
440
+ # main execution task
441
+ with task.start(ctx=OnlineRLContext()):
442
+ task.use(interaction_evaluator(evaluate_cfg, self.policy.eval_mode, env))
443
+ task.run(max_step=1)
444
+
445
+ return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
446
+
447
+ @property
448
+ def best(self) -> 'PPOOffPolicyAgent':
449
+ """
450
+ Overview:
451
+ Load the best model from the checkpoint directory, \
452
+ which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \
453
+ The return value is the agent with the best model.
454
+ Returns:
455
+ - (:obj:`PPOOffPolicyAgent`): The agent with the best model.
456
+ Examples:
457
+ >>> agent = PPOOffPolicyAgent(env_id='LunarLander-v2')
458
+ >>> agent.train()
459
+ >>> agent = agent.best
460
+
461
+ .. note::
462
+ The best model is the model with the highest evaluation return. When this property is accessed, the current \
463
+ model will be replaced by the best model.
464
+ """
465
+
466
+ best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
467
+ # Load best model if it exists
468
+ if os.path.exists(best_model_file_path):
469
+ policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
470
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
471
+ return self
DI-engine/ding/bonus/ppof.py ADDED
@@ -0,0 +1,509 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ from functools import partial
5
+ import os
6
+ import gym
7
+ import gymnasium
8
+ import numpy as np
9
+ import torch
10
+ from ding.framework import task, OnlineRLContext
11
+ from ding.framework.middleware import interaction_evaluator_ttorch, PPOFStepCollector, multistep_trainer, CkptSaver, \
12
+ wandb_online_logger, offline_data_saver, termination_checker, ppof_adv_estimator
13
+ from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2
14
+ from ding.policy import PPOFPolicy, single_env_forward_wrapper_ttorch
15
+ from ding.utils import set_pkg_seed
16
+ from ding.utils import get_env_fps, render
17
+ from ding.config import save_config_py
18
+ from .model import PPOFModel
19
+ from .config import get_instance_config, get_instance_env, get_hybrid_shape
20
+ from ding.bonus.common import TrainingReturn, EvalReturn
21
+
22
+
23
+ class PPOF:
24
+ """
25
+ Overview:
26
+ Class of agent for training, evaluation and deployment of the reinforcement learning algorithm \
27
+ Proximal Policy Optimization (PPO).
28
+ For more information about the system design of RL agent, please refer to \
29
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
30
+ Interface:
31
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
32
+ """
33
+
34
+ supported_env_list = [
35
+ # common
36
+ 'LunarLander-v2',
37
+ 'LunarLanderContinuous-v2',
38
+ 'BipedalWalker-v3',
39
+ 'Pendulum-v1',
40
+ 'acrobot',
41
+ # ch2: action
42
+ 'rocket_landing',
43
+ 'drone_fly',
44
+ 'hybrid_moving',
45
+ # ch3: obs
46
+ 'evogym_carrier',
47
+ 'mario',
48
+ 'di_sheep',
49
+ 'procgen_bigfish',
50
+ # ch4: reward
51
+ 'minigrid_fourroom',
52
+ 'metadrive',
53
+ # atari
54
+ 'BowlingNoFrameskip-v4',
55
+ 'BreakoutNoFrameskip-v4',
56
+ 'GopherNoFrameskip-v4',
57
+ 'KangarooNoFrameskip-v4',
58
+ 'PongNoFrameskip-v4',
59
+ 'QbertNoFrameskip-v4',
60
+ 'SpaceInvadersNoFrameskip-v4',
61
+ # mujoco
62
+ 'Hopper-v3',
63
+ 'HalfCheetah-v3',
64
+ 'Walker2d-v3',
65
+ ]
66
+ """
67
+ Overview:
68
+ List of supported envs.
69
+ Examples:
70
+ >>> from ding.bonus.ppof import PPOF
71
+ >>> print(PPOF.supported_env_list)
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ env_id: str = None,
77
+ env: BaseEnv = None,
78
+ seed: int = 0,
79
+ exp_name: str = None,
80
+ model: Optional[torch.nn.Module] = None,
81
+ cfg: Optional[Union[EasyDict, dict]] = None,
82
+ policy_state_dict: str = None
83
+ ) -> None:
84
+ """
85
+ Overview:
86
+ Initialize agent for PPO algorithm.
87
+ Arguments:
88
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
89
+ If ``env_id`` is not specified, ``env_id`` in ``cfg`` must be specified. \
90
+ If ``env_id`` is specified, ``env_id`` in ``cfg`` will be ignored. \
91
+ ``env_id`` should be one of the supported envs, which can be found in ``PPOF.supported_env_list``.
92
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
93
+ If ``env`` is not specified, ``env_id`` or ``cfg.env_id`` must be specified. \
94
+ ``env_id`` or ``cfg.env_id`` will be used to create environment instance. \
95
+ If ``env`` is specified, ``env_id`` and ``cfg.env_id`` will be ignored.
96
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
97
+ Default to 0.
98
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
99
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
100
+ - model (:obj:`torch.nn.Module`): The model of PPO algorithm, which should be an instance of class \
101
+ ``ding.bonus.model.PPOFModel``. \
102
+ If not specified, a default model will be generated according to the configuration.
103
+ - cfg (:obj:`Union[EasyDict, dict]`): The configuration of PPO algorithm, which is a dict. \
104
+ Default to None. If not specified, the default configuration will be used.
105
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
106
+ If specified, the policy will be loaded from this file. Default to None.
107
+
108
+ .. note::
109
+ An RL Agent Instance can be initialized in two basic ways. \
110
+ For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
111
+ and we want to train an agent with PPO algorithm with default configuration. \
112
+ Then we can initialize the agent in the following ways:
113
+ >>> agent = PPOF(env_id='LunarLander-v2')
114
+ or, if we want to specify the env_id in the configuration:
115
+ >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
116
+ >>> agent = PPOF(cfg=cfg)
117
+ There are also other arguments to specify the agent when initializing.
118
+ For example, if we want to specify the environment instance:
119
+ >>> env = CustomizedEnv('LunarLander-v2')
120
+ >>> agent = PPOF(cfg=cfg, env=env)
121
+ or, if we want to specify the model:
122
+ >>> model = PPOFModel(obs_shape, action_shape, **cfg.model)
123
+ >>> agent = PPOF(cfg=cfg, model=model)
124
+ or, if we want to reload the policy from a saved policy state dict:
125
+ >>> agent = PPOF(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
126
+ Make sure that the configuration is consistent with the saved policy state dict.
127
+ """
128
+
129
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
130
+
131
+ if cfg is not None and not isinstance(cfg, EasyDict):
132
+ cfg = EasyDict(cfg)
133
+
134
+ if env_id is not None:
135
+ assert env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list)
136
+ if cfg is None:
137
+ cfg = get_instance_config(env_id, algorithm="PPOF")
138
+
139
+ if not hasattr(cfg, "env_id"):
140
+ cfg.env_id = env_id
141
+ assert cfg.env_id == env_id, "env_id in cfg should be the same as env_id in args."
142
+ else:
143
+ assert hasattr(cfg, "env_id"), "Please specify env_id in cfg."
144
+ assert cfg.env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(
145
+ PPOF.supported_env_list
146
+ )
147
+
148
+ if exp_name is not None:
149
+ cfg.exp_name = exp_name
150
+ elif not hasattr(cfg, "exp_name"):
151
+ cfg.exp_name = "{}-{}".format(cfg.env_id, "PPO")
152
+ self.cfg = cfg
153
+ self.exp_name = self.cfg.exp_name
154
+
155
+ if env is None:
156
+ self.env = get_instance_env(self.cfg.env_id)
157
+ else:
158
+ self.env = env
159
+
160
+ logging.getLogger().setLevel(logging.INFO)
161
+ self.seed = seed
162
+ set_pkg_seed(self.seed, use_cuda=self.cfg.cuda)
163
+
164
+ if not os.path.exists(self.exp_name):
165
+ os.makedirs(self.exp_name)
166
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
167
+
168
+ action_space = self.env.action_space
169
+ if isinstance(action_space, (gym.spaces.Discrete, gymnasium.spaces.Discrete)):
170
+ action_shape = int(action_space.n)
171
+ elif isinstance(action_space, (gym.spaces.Tuple, gymnasium.spaces.Tuple)):
172
+ action_shape = get_hybrid_shape(action_space)
173
+ else:
174
+ action_shape = action_space.shape
175
+
176
+ # Four types of value normalization are supported currently
177
+ assert self.cfg.value_norm in ['popart', 'value_rescale', 'symlog', 'baseline']
178
+ if model is None:
179
+ if self.cfg.value_norm != 'popart':
180
+ model = PPOFModel(
181
+ self.env.observation_space.shape,
182
+ action_shape,
183
+ action_space=self.cfg.action_space,
184
+ **self.cfg.model
185
+ )
186
+ else:
187
+ model = PPOFModel(
188
+ self.env.observation_space.shape,
189
+ action_shape,
190
+ action_space=self.cfg.action_space,
191
+ popart_head=True,
192
+ **self.cfg.model
193
+ )
194
+ self.policy = PPOFPolicy(self.cfg, model=model)
195
+ if policy_state_dict is not None:
196
+ self.policy.load_state_dict(policy_state_dict)
197
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
198
+
199
+ def train(
200
+ self,
201
+ step: int = int(1e7),
202
+ collector_env_num: int = 4,
203
+ evaluator_env_num: int = 4,
204
+ n_iter_log_show: int = 500,
205
+ n_iter_save_ckpt: int = 1000,
206
+ context: Optional[str] = None,
207
+ reward_model: Optional[str] = None,
208
+ debug: bool = False,
209
+ wandb_sweep: bool = False,
210
+ ) -> TrainingReturn:
211
+ """
212
+ Overview:
213
+ Train the agent with PPO algorithm for ``step`` iterations with ``collector_env_num`` collector \
214
+ environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
215
+ recorded and saved by wandb.
216
+ Arguments:
217
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
218
+ - collector_env_num (:obj:`int`): The number of collector environments. Default to 4.
219
+ - evaluator_env_num (:obj:`int`): The number of evaluator environments. Default to 4.
220
+ - n_iter_log_show (:obj:`int`): The frequency of logging every training iteration. Default to 500.
221
+ - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
222
+ Default to 1000.
223
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
224
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
225
+ - reward_model (:obj:`str`): The reward model name. Default to None. This argument is not supported yet.
226
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
227
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
228
+ subprocess environment manager will be used.
229
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
230
+ which is a hyper-parameter optimization process for seeking the best configurations. \
231
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
232
+ Returns:
233
+ - (:obj:`TrainingReturn`): The training result, of which the attributes are:
234
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
235
+ """
236
+
237
+ if debug:
238
+ logging.getLogger().setLevel(logging.DEBUG)
239
+ logging.debug(self.policy._model)
240
+ # define env and policy
241
+ collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector')
242
+ evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator')
243
+
244
+ if reward_model is not None:
245
+ # self.reward_model = create_reward_model(reward_model, self.cfg.reward_model)
246
+ pass
247
+
248
+ with task.start(ctx=OnlineRLContext()):
249
+ task.use(interaction_evaluator_ttorch(self.seed, self.policy, evaluator_env))
250
+ task.use(CkptSaver(self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
251
+ task.use(PPOFStepCollector(self.seed, self.policy, collector_env, self.cfg.n_sample))
252
+ task.use(ppof_adv_estimator(self.policy))
253
+ task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show))
254
+ task.use(
255
+ wandb_online_logger(
256
+ metric_list=self.policy.monitor_vars(),
257
+ model=self.policy._model,
258
+ anonymous=True,
259
+ project_name=self.exp_name,
260
+ wandb_sweep=wandb_sweep,
261
+ )
262
+ )
263
+ task.use(termination_checker(max_env_step=step))
264
+ task.run()
265
+
266
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
267
+
268
+ def deploy(
269
+ self,
270
+ enable_save_replay: bool = False,
271
+ concatenate_all_replay: bool = False,
272
+ replay_save_path: str = None,
273
+ seed: Optional[Union[int, List]] = None,
274
+ debug: bool = False
275
+ ) -> EvalReturn:
276
+ """
277
+ Overview:
278
+ Deploy the agent with PPO algorithm by interacting with the environment, during which the replay video \
279
+ can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
280
+ Arguments:
281
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
282
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
283
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
284
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
285
+ the replay video of each episode will be saved separately.
286
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
287
+ If not specified, the video will be saved in ``exp_name/videos``.
288
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
289
+ Default to None. If not specified, ``self.seed`` will be used. \
290
+ If ``seed`` is an integer, the agent will be deployed once. \
291
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
292
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
293
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
294
+ subprocess environment manager will be used.
295
+ Returns:
296
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
297
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
298
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
299
+ """
300
+
301
+ if debug:
302
+ logging.getLogger().setLevel(logging.DEBUG)
303
+ # define env and policy
304
+ env = self.env.clone(caller='evaluator')
305
+
306
+ if seed is not None and isinstance(seed, int):
307
+ seeds = [seed]
308
+ elif seed is not None and isinstance(seed, list):
309
+ seeds = seed
310
+ else:
311
+ seeds = [self.seed]
312
+
313
+ returns = []
314
+ images = []
315
+ if enable_save_replay:
316
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
317
+ env.enable_save_replay(replay_path=replay_save_path)
318
+ else:
319
+ logging.warning('No video would be generated during the deploy.')
320
+ if concatenate_all_replay:
321
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
322
+ concatenate_all_replay = False
323
+
324
+ forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.cuda)
325
+
326
+ # reset first to make sure the env is in the initial state
327
+ # env will be reset again in the main loop
328
+ env.reset()
329
+
330
+ for seed in seeds:
331
+ env.seed(seed, dynamic_seed=False)
332
+ return_ = 0.
333
+ step = 0
334
+ obs = env.reset()
335
+ if concatenate_all_replay:
+     images.append(render(env)[None])
336
+ while True:
337
+ action = forward_fn(obs)
338
+ obs, rew, done, info = env.step(action)
339
+ if concatenate_all_replay:
+     images.append(render(env)[None])
340
+ return_ += rew
341
+ step += 1
342
+ if done:
343
+ break
344
+ logging.info(f'PPO deploy is finished, final episode return with {step} steps is: {return_}')
345
+ returns.append(return_)
346
+
347
+ env.close()
348
+
349
+ if concatenate_all_replay:
350
+ images = np.concatenate(images, axis=0)
351
+ import imageio
352
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
353
+
354
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
355
+
356
+ def collect_data(
357
+ self,
358
+ env_num: int = 8,
359
+ save_data_path: Optional[str] = None,
360
+ n_sample: Optional[int] = None,
361
+ n_episode: Optional[int] = None,
362
+ context: Optional[str] = None,
363
+ debug: bool = False
364
+ ) -> None:
365
+ """
366
+ Overview:
367
+ Collect data with PPO algorithm for ``n_episode`` episodes with ``env_num`` collector environments. \
368
+ The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
369
+ ``exp_name/demo_data``.
370
+ Arguments:
371
+ - env_num (:obj:`int`): The number of collector environments. Default to 8.
372
+ - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
373
+ If not specified, the data will be saved in ``exp_name/demo_data``.
374
+ - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
375
+ If not specified, ``n_episode`` must be specified.
376
+ - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
377
+ If not specified, ``n_sample`` must be specified.
378
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
379
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
380
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
381
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
382
+ subprocess environment manager will be used.
383
+ """
384
+
385
+ if debug:
386
+ logging.getLogger().setLevel(logging.DEBUG)
387
+ if n_episode is not None:
388
+ raise NotImplementedError
389
+ # define env and policy
390
+ env = self._setup_env_manager(env_num, context, debug, 'collector')
391
+ if save_data_path is None:
392
+ save_data_path = os.path.join(self.exp_name, 'demo_data')
393
+
394
+ # main execution task
395
+ with task.start(ctx=OnlineRLContext()):
396
+ task.use(PPOFStepCollector(self.seed, self.policy, env, n_sample))
397
+ task.use(offline_data_saver(save_data_path, data_type='hdf5'))
398
+ task.run(max_step=1)
399
+ logging.info(
400
+ f'PPOF collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
401
+ )
402
+
403
+ def batch_evaluate(
404
+ self,
405
+ env_num: int = 4,
406
+ n_evaluator_episode: int = 4,
407
+ context: Optional[str] = None,
408
+ debug: bool = False,
409
+ ) -> EvalReturn:
410
+ """
411
+ Overview:
412
+ Evaluate the agent with PPO algorithm for ``n_evaluator_episode`` episodes with ``env_num`` evaluator \
413
+ environments. The evaluation result will be returned.
414
+ The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
415
+ multiple evaluator environments to evaluate the agent to get an average performance, while ``deploy`` \
416
+ will only create one evaluator environment to evaluate the agent and save the replay video.
417
+ Arguments:
418
+ - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
419
+ - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
420
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
421
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
422
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
423
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
424
+ subprocess environment manager will be used.
425
+ Returns:
426
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
427
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
428
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
429
+ """
430
+
431
+ if debug:
432
+ logging.getLogger().setLevel(logging.DEBUG)
433
+ # define env and policy
434
+ env = self._setup_env_manager(env_num, context, debug, 'evaluator')
435
+
436
+ # reset first to make sure the env is in the initial state
437
+ # env will be reset again in the main loop
438
+ env.launch()
439
+ env.reset()
440
+
441
+ # main execution task
442
+ with task.start(ctx=OnlineRLContext()):
443
+ task.use(interaction_evaluator_ttorch(
444
+ self.seed,
445
+ self.policy,
446
+ env,
447
+ n_evaluator_episode,
448
+ ))
449
+ task.run(max_step=1)
450
+
451
+ return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
452
+
453
+ def _setup_env_manager(
454
+ self,
455
+ env_num: int,
456
+ context: Optional[str] = None,
457
+ debug: bool = False,
458
+ caller: str = 'collector'
459
+ ) -> BaseEnvManagerV2:
460
+ """
461
+ Overview:
462
+ Setup the environment manager. The environment manager is used to manage multiple environments.
463
+ Arguments:
464
+ - env_num (:obj:`int`): The number of environments.
465
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
466
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
467
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
468
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
469
+ subprocess environment manager will be used.
470
+ - caller (:obj:`str`): The caller of the environment manager. Default to 'collector'.
471
+ Returns:
472
+ - (:obj:`BaseEnvManagerV2`): The environment manager.
473
+ """
474
+ assert caller in ['evaluator', 'collector']
475
+ if debug:
476
+ env_cls = BaseEnvManagerV2
477
+ manager_cfg = env_cls.default_config()
478
+ else:
479
+ env_cls = SubprocessEnvManagerV2
480
+ manager_cfg = env_cls.default_config()
481
+ if context is not None:
482
+ manager_cfg.context = context
483
+ return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg)
484
+
485
+ @property
486
+ def best(self) -> 'PPOF':
487
+ """
488
+ Overview:
489
+ Load the best model from the checkpoint directory, \
490
+ which is by default in folder ``exp_name/ckpt/eval.pth.tar``. \
491
+ The return value is the agent with the best model.
492
+ Returns:
493
+ - (:obj:`PPOF`): The agent with the best model.
494
+ Examples:
495
+ >>> agent = PPOF(env_id='LunarLander-v2')
496
+ >>> agent.train()
497
+ >>> agent = agent.best
498
+
499
+ .. note::
500
+ The best model is the model with the highest evaluation return. When this property is accessed, the current \
501
+ model will be replaced by the best model.
502
+ """
503
+
504
+ best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
505
+ # Load best model if it exists
506
+ if os.path.exists(best_model_file_path):
507
+ policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
508
+ self.policy.load_state_dict(policy_state_dict)
509
+ return self
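+ # Usage sketch (illustrative only): batch_evaluate runs several evaluator envs in
+ # parallel, while deploy runs a single env (optionally recording video).
+ #     result = agent.batch_evaluate(env_num=4, n_evaluator_episode=8)
+ #     print(result.eval_value, result.eval_value_std)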
DI-engine/ding/bonus/sac.py ADDED
@@ -0,0 +1,457 @@
1
+ from typing import Optional, Union, List
2
+ from ditk import logging
3
+ from easydict import EasyDict
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ import treetensor.torch as ttorch
8
+ from ding.framework import task, OnlineRLContext
9
+ from ding.framework.middleware import CkptSaver, \
10
+ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
11
+ OffPolicyLearner, final_ctx_saver
12
+ from ding.envs import BaseEnv
13
+ from ding.envs import setup_ding_env_manager
14
+ from ding.policy import SACPolicy
15
+ from ding.utils import set_pkg_seed
16
+ from ding.utils import get_env_fps, render
17
+ from ding.config import save_config_py, compile_config
18
+ from ding.model import ContinuousQAC
19
+ from ding.model import model_wrap
20
+ from ding.data import DequeBuffer
21
+ from ding.bonus.common import TrainingReturn, EvalReturn
22
+ from ding.config.example.SAC import supported_env_cfg
23
+ from ding.config.example.SAC import supported_env
24
+
25
+
26
+ class SACAgent:
27
+ """
28
+ Overview:
29
+ Class of agent for training, evaluation and deployment of the reinforcement learning algorithm \
30
+ Soft Actor-Critic (SAC).
31
+ For more information about the system design of RL agent, please refer to \
32
+ <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
33
+ Interface:
34
+ ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
35
+ """
36
+ supported_env_list = list(supported_env_cfg.keys())
37
+ """
38
+ Overview:
39
+ List of supported envs.
40
+ Examples:
41
+ >>> from ding.bonus.sac import SACAgent
42
+ >>> print(SACAgent.supported_env_list)
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ env_id: str = None,
48
+ env: BaseEnv = None,
49
+ seed: int = 0,
50
+ exp_name: str = None,
51
+ model: Optional[torch.nn.Module] = None,
52
+ cfg: Optional[Union[EasyDict, dict]] = None,
53
+ policy_state_dict: str = None,
54
+ ) -> None:
55
+ """
56
+ Overview:
57
+ Initialize agent for SAC algorithm.
58
+ Arguments:
59
+ - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
60
+ If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
61
+ If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
62
+ ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
63
+ - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
64
+ If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
65
+ ``env_id`` or ``cfg.env.env_id`` will be used to create environment instance. \
66
+ If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
67
+ - seed (:obj:`int`): The random seed, which is set before running the program. \
68
+ Default to 0.
69
+ - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
70
+ log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
71
+ - model (:obj:`torch.nn.Module`): The model of SAC algorithm, which should be an instance of class \
72
+ :class:`ding.model.ContinuousQAC`. \
73
+ If not specified, a default model will be generated according to the configuration.
74
+ - cfg (:obj:`Union[EasyDict, dict]`): The configuration of SAC algorithm, which is a dict. \
75
+ Default to None. If not specified, the default configuration will be used. \
76
+ The default configuration can be found in ``ding/config/example/SAC/gym_lunarlander_v2.py``.
77
+ - policy_state_dict (:obj:`str`): The path of the policy state dict saved by PyTorch in a local file. \
78
+ If specified, the policy will be loaded from this file. Default to None.
79
+
80
+ .. note::
81
+ An RL Agent Instance can be initialized in two basic ways. \
82
+ For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
83
+ and we want to train an agent with SAC algorithm with default configuration. \
84
+ Then we can initialize the agent in the following ways:
85
+ >>> agent = SACAgent(env_id='LunarLanderContinuous-v2')
86
+ or, if we want to specify the env_id in the configuration:
87
+ >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
88
+ >>> agent = SACAgent(cfg=cfg)
89
+ There are also other arguments to specify the agent when initializing.
90
+ For example, if we want to specify the environment instance:
91
+ >>> env = CustomizedEnv('LunarLanderContinuous-v2')
92
+ >>> agent = SACAgent(cfg=cfg, env=env)
93
+ or, if we want to specify the model:
94
+ >>> model = ContinuousQAC(**cfg.policy.model)
95
+ >>> agent = SACAgent(cfg=cfg, model=model)
96
+ or, if we want to reload the policy from a saved policy state dict:
97
+ >>> agent = SACAgent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
98
+ Make sure that the configuration is consistent with the saved policy state dict.
99
+ """
100
+
101
+ assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
102
+
103
+ if cfg is not None and not isinstance(cfg, EasyDict):
104
+ cfg = EasyDict(cfg)
105
+
106
+ if env_id is not None:
107
+ assert env_id in SACAgent.supported_env_list, "Please use supported envs: {}".format(
108
+ SACAgent.supported_env_list
109
+ )
110
+ if cfg is None:
111
+ cfg = supported_env_cfg[env_id]
112
+ else:
113
+ assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
114
+ else:
115
+ assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
116
+ assert cfg.env.env_id in SACAgent.supported_env_list, "Please use supported envs: {}".format(
117
+ SACAgent.supported_env_list
118
+ )
119
+ default_policy_config = EasyDict({"policy": SACPolicy.default_config()})
120
+ default_policy_config.update(cfg)
121
+ cfg = default_policy_config
122
+
123
+ if exp_name is not None:
124
+ cfg.exp_name = exp_name
125
+ self.cfg = compile_config(cfg, policy=SACPolicy)
126
+ self.exp_name = self.cfg.exp_name
127
+ if env is None:
128
+ self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
129
+ else:
130
+ assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
131
+ self.env = env
132
+
133
+ logging.getLogger().setLevel(logging.INFO)
134
+ self.seed = seed
135
+ set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
136
+ if not os.path.exists(self.exp_name):
137
+ os.makedirs(self.exp_name)
138
+ save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
139
+ if model is None:
140
+ model = ContinuousQAC(**self.cfg.policy.model)
141
+ self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
142
+ self.policy = SACPolicy(self.cfg.policy, model=model)
143
+ if policy_state_dict is not None:
144
+ self.policy.learn_mode.load_state_dict(policy_state_dict)
145
+ self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
146
+
147
+ def train(
148
+ self,
149
+ step: int = int(1e7),
150
+ collector_env_num: int = None,
151
+ evaluator_env_num: int = None,
152
+ n_iter_save_ckpt: int = 1000,
153
+ context: Optional[str] = None,
154
+ debug: bool = False,
155
+ wandb_sweep: bool = False,
156
+ ) -> TrainingReturn:
157
+ """
158
+ Overview:
159
+ Train the agent with SAC algorithm for ``step`` iterations with ``collector_env_num`` collector \
160
+ environments and ``evaluator_env_num`` evaluator environments. Information during training will be \
161
+ recorded and saved by wandb.
162
+ Arguments:
163
+ - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
164
+ - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
165
+ If not specified, it will be set according to the configuration.
166
+ - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
167
+ If not specified, it will be set according to the configuration.
168
+ - n_iter_save_ckpt (:obj:`int`): The frequency of saving checkpoint every training iteration. \
169
+ Default to 1000.
170
+ - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
171
+ It can be specified as ``spawn``, ``fork`` or ``forkserver``.
172
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
173
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
174
+ subprocess environment manager will be used.
175
+ - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
176
+ which is a hyper-parameter optimization process for seeking the best configurations. \
177
+ Default to False. If True, the wandb sweep id will be used as the experiment name.
178
+ Returns:
179
+ - (:obj:`TrainingReturn`): The training result, of which the attributes are:
180
+ - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
181
+ """
182
+
183
+ if debug:
184
+ logging.getLogger().setLevel(logging.DEBUG)
185
+ logging.debug(self.policy._model)
186
+ # define env and policy
187
+ collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
188
+ evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
189
+ collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
190
+ evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
191
+
192
+ with task.start(ctx=OnlineRLContext()):
193
+ task.use(
194
+ interaction_evaluator(
195
+ self.cfg,
196
+ self.policy.eval_mode,
197
+ evaluator_env,
198
+ render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
199
+ )
200
+ )
201
+ task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
202
+ task.use(
203
+ StepCollector(
204
+ self.cfg,
205
+ self.policy.collect_mode,
206
+ collector_env,
207
+ random_collect_size=self.cfg.policy.random_collect_size
208
+ if hasattr(self.cfg.policy, 'random_collect_size') else 0,
209
+ )
210
+ )
211
+ task.use(data_pusher(self.cfg, self.buffer_))
212
+ task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
213
+ task.use(
214
+ wandb_online_logger(
215
+ metric_list=self.policy._monitor_vars_learn(),
216
+ model=self.policy._model,
217
+ anonymous=True,
218
+ project_name=self.exp_name,
219
+ wandb_sweep=wandb_sweep,
220
+ )
221
+ )
222
+ task.use(termination_checker(max_env_step=step))
223
+ task.use(final_ctx_saver(name=self.exp_name))
224
+ task.run()
225
+
226
+ return TrainingReturn(wandb_url=task.ctx.wandb_url)
227
+
228
+ def deploy(
229
+ self,
230
+ enable_save_replay: bool = False,
231
+ concatenate_all_replay: bool = False,
232
+ replay_save_path: str = None,
233
+ seed: Optional[Union[int, List]] = None,
234
+ debug: bool = False
235
+ ) -> EvalReturn:
236
+ """
237
+ Overview:
238
+ Deploy the agent with SAC algorithm by interacting with the environment, during which the replay video \
239
+ can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
240
+ Arguments:
241
+ - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
242
+ - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
243
+ Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
244
+ If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
245
+ the replay video of each episode will be saved separately.
246
+ - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
247
+ If not specified, the video will be saved in ``exp_name/videos``.
248
+ - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
249
+ Default to None. If not specified, ``self.seed`` will be used. \
250
+ If ``seed`` is an integer, the agent will be deployed once. \
251
+ If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
252
+ - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
253
+ If set True, base environment manager will be used for easy debugging. Otherwise, \
254
+ subprocess environment manager will be used.
255
+ Returns:
256
+ - (:obj:`EvalReturn`): The evaluation result, of which the attributions are:
257
+ - eval_value (:obj:`np.float32`): The mean of evaluation return.
258
+ - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
259
+ """
260
+
261
+ if debug:
262
+ logging.getLogger().setLevel(logging.DEBUG)
263
+ # define env and policy
264
+ env = self.env.clone(caller='evaluator')
265
+
266
+ if seed is not None and isinstance(seed, int):
267
+ seeds = [seed]
268
+ elif seed is not None and isinstance(seed, list):
269
+ seeds = seed
270
+ else:
271
+ seeds = [self.seed]
272
+
273
+ returns = []
274
+ images = []
275
+ if enable_save_replay:
276
+ replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
277
+ env.enable_save_replay(replay_path=replay_save_path)
278
+ else:
279
+ logging.warning('No video would be generated during the deploy.')
280
+ if concatenate_all_replay:
281
+ logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
282
+ concatenate_all_replay = False
283
+
284
+ def single_env_forward_wrapper(forward_fn, cuda=True):
285
+
286
+ forward_fn = model_wrap(forward_fn, wrapper_name='base').forward
287
+
288
+ def _forward(obs):
289
+ # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
290
+ obs = ttorch.as_tensor(obs).unsqueeze(0)
291
+ if cuda and torch.cuda.is_available():
292
+ obs = obs.cuda()
293
+ (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit']
294
+ action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval
295
+ return action
296
+
297
+ return _forward
298
+
299
+ forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
300
+
301
+ # reset first to make sure the env is in the initial state
302
+ # env will be reset again in the main loop
303
+ env.reset()
304
+
305
+ for seed in seeds:
306
+ env.seed(seed, dynamic_seed=False)
307
+ return_ = 0.
308
+ step = 0
309
+ obs = env.reset()
310
+ images.append(render(env)[None]) if concatenate_all_replay else None
311
+ while True:
312
+ action = forward_fn(obs)
313
+ obs, rew, done, info = env.step(action)
314
+ images.append(render(env)[None]) if concatenate_all_replay else None
315
+ return_ += rew
316
+ step += 1
317
+ if done:
318
+ break
319
+ logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}')
320
+ returns.append(return_)
321
+
322
+ env.close()
323
+
324
+ if concatenate_all_replay:
325
+ images = np.concatenate(images, axis=0)
326
+ import imageio
327
+ imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
328
+
329
+ return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
330
+
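A note on the deterministic evaluation above: SAC's actor outputs a Gaussian ``(mu, sigma)`` and the collector samples from it, while ``_forward`` deploys the squashed mean ``tanh(mu)``. A minimal sketch of the contrast, with illustrative tensor values (not DI-engine source):

    import torch

    mu = torch.tensor([0.3, -1.2])    # illustrative actor mean, shape (A, )
    sigma = torch.tensor([0.5, 0.5])  # illustrative actor std, shape (A, )

    # collection: sample from the Gaussian, then squash into (-1, 1)
    stochastic_action = torch.tanh(mu + sigma * torch.randn_like(mu))

    # deployment (as in ``_forward`` above): drop the noise, squash the mean
    deterministic_action = torch.tanh(mu)
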
+     def collect_data(
+         self,
+         env_num: int = 8,
+         save_data_path: Optional[str] = None,
+         n_sample: Optional[int] = None,
+         n_episode: Optional[int] = None,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> None:
+         """
+         Overview:
+             Collect data with the SAC algorithm: ``n_sample`` transition samples are collected with ``env_num`` \
+             collector environments (episode-based collection via ``n_episode`` is not yet implemented). \
+             The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
+             ``exp_name/demo_data``.
+         Arguments:
+             - env_num (:obj:`int`): The number of collector environments. Default to 8.
+             - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
+                 If not specified, the data will be saved in ``exp_name/demo_data``.
+             - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
+                 If not specified, ``n_episode`` must be specified.
+             - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
+                 If not specified, ``n_sample`` must be specified.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         if n_episode is not None:
+             raise NotImplementedError
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.collector_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
+
+         if save_data_path is None:
+             save_data_path = os.path.join(self.exp_name, 'demo_data')
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 StepCollector(
+                     self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
+                 )
+             )
+             task.use(offline_data_saver(save_data_path, data_type='hdf5'))
+             task.run(max_step=1)
+         logging.info(
+             f'SAC collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
+         )
+
+     def batch_evaluate(
+         self,
+         env_num: int = 4,
+         n_evaluator_episode: int = 4,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Evaluate the agent with the SAC algorithm for ``n_evaluator_episode`` episodes with ``env_num`` \
+             evaluator environments. The evaluation result will be returned.
+             The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
+             multiple evaluator environments to evaluate the agent and get an average performance, while ``deploy`` \
+             will only create one evaluator environment to evaluate the agent and save the replay video.
+         Arguments:
+             - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
+             - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.evaluator_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
+
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.launch()
+         env.reset()
+
+         evaluate_cfg = self.cfg
+         evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
+             task.run(max_step=1)
+
+         return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
+
+     @property
+     def best(self) -> 'SACAgent':
+         """
+         Overview:
+             Load the best model from the checkpoint directory; \
+             by default it is saved at ``exp_name/ckpt/eval.pth.tar``. \
+             The return value is the agent with the best model.
+         Returns:
+             - (:obj:`SACAgent`): The agent with the best model.
+         Examples:
+             >>> agent = SACAgent(env_id='LunarLanderContinuous-v2')
+             >>> agent.train()
+             >>> agent = agent.best
+
+         .. note::
+             The best model is the model with the highest evaluation return. If this method is called, the current \
+             model will be replaced by the best model.
+         """
+
+         best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
+         # Load best model if it exists
+         if os.path.exists(best_model_file_path):
+             policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         return self
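A minimal end-to-end usage sketch of the ``SACAgent`` API defined above; the experiment name and step budget are illustrative, not recommended values:

    from ding.bonus.sac import SACAgent

    agent = SACAgent(env_id='LunarLanderContinuous-v2', exp_name='sac-demo', seed=0)
    train_result = agent.train(step=100000)                   # TrainingReturn(wandb_url=...)
    eval_result = agent.best.deploy(enable_save_replay=True)  # EvalReturn(eval_value, eval_value_std)
    print(train_result.wandb_url, eval_result.eval_value, eval_result.eval_value_std)
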
DI-engine/ding/bonus/sql.py ADDED
@@ -0,0 +1,461 @@
+ from typing import Optional, Union, List
+ from ditk import logging
+ from easydict import EasyDict
+ import os
+ import numpy as np
+ import torch
+ import treetensor.torch as ttorch
+ from ding.framework import task, OnlineRLContext
+ from ding.framework.middleware import CkptSaver, \
+     wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
+     OffPolicyLearner, final_ctx_saver, nstep_reward_enhancer, eps_greedy_handler
+ from ding.envs import BaseEnv
+ from ding.envs import setup_ding_env_manager
+ from ding.policy import SQLPolicy
+ from ding.utils import set_pkg_seed
+ from ding.utils import get_env_fps, render
+ from ding.config import save_config_py, compile_config
+ from ding.model import DQN
+ from ding.model import model_wrap
+ from ding.data import DequeBuffer
+ from ding.bonus.common import TrainingReturn, EvalReturn
+ from ding.config.example.SQL import supported_env_cfg
+ from ding.config.example.SQL import supported_env
+
+
+ class SQLAgent:
+     """
+     Overview:
+         Class of agent for training, evaluation and deployment of the reinforcement learning algorithm \
+         Soft Q-Learning (SQL).
+         For more information about the system design of RL agent, please refer to \
+         <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
+     Interface:
+         ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+     """
+     supported_env_list = list(supported_env_cfg.keys())
+     """
+     Overview:
+         List of supported envs.
+     Examples:
+         >>> from ding.bonus.sql import SQLAgent
+         >>> print(SQLAgent.supported_env_list)
+     """
+
+     def __init__(
+         self,
+         env_id: str = None,
+         env: BaseEnv = None,
+         seed: int = 0,
+         exp_name: str = None,
+         model: Optional[torch.nn.Module] = None,
+         cfg: Optional[Union[EasyDict, dict]] = None,
+         policy_state_dict: str = None,
+     ) -> None:
+         """
+         Overview:
+             Initialize agent for SQL algorithm.
+         Arguments:
+             - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                 If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                 If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                 ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+             - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                 If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
+                 ``env_id`` or ``cfg.env.env_id`` will be used to create the environment instance. \
+                 If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+             - seed (:obj:`int`): The random seed, which is set before running the program. \
+                 Default to 0.
+             - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                 log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+             - model (:obj:`torch.nn.Module`): The model of SQL algorithm, which should be an instance of class \
+                 :class:`ding.model.DQN`. \
+                 If not specified, a default model will be generated according to the configuration.
+             - cfg (:obj:`Union[EasyDict, dict]`): The configuration of SQL algorithm, which is a dict. \
+                 Default to None. If not specified, the default configuration will be used. \
+                 The default configuration can be found in ``ding/config/example/SQL/gym_lunarlander_v2.py``.
+             - policy_state_dict (:obj:`str`): The path of a policy state dict saved by PyTorch in a local file. \
+                 If specified, the policy will be loaded from this file. Default to None.
+
+         .. note::
+             An RL agent instance can be initialized in two basic ways. \
+             For example, we have an environment with id ``LunarLander-v2`` registered in gym, \
+             and we want to train an agent with the SQL algorithm with default configuration. \
+             Then we can initialize the agent in the following ways:
+             >>> agent = SQLAgent(env_id='LunarLander-v2')
+             or, we can specify the env_id in the configuration:
+             >>> cfg = {'env': {'env_id': 'LunarLander-v2'}, 'policy': ...... }
+             >>> agent = SQLAgent(cfg=cfg)
+             There are also other arguments to specify the agent when initializing.
+             For example, if we want to specify the environment instance:
+             >>> env = CustomizedEnv('LunarLander-v2')
+             >>> agent = SQLAgent(cfg=cfg, env=env)
+             or, if we want to specify the model:
+             >>> model = DQN(**cfg.policy.model)
+             >>> agent = SQLAgent(cfg=cfg, model=model)
+             or, if we want to reload the policy from a saved policy state dict:
+             >>> agent = SQLAgent(cfg=cfg, policy_state_dict='LunarLander-v2.pth.tar')
+             Make sure that the configuration is consistent with the saved policy state dict.
+         """
+
+         assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+
+         if cfg is not None and not isinstance(cfg, EasyDict):
+             cfg = EasyDict(cfg)
+
+         if env_id is not None:
+             assert env_id in SQLAgent.supported_env_list, "Please use supported envs: {}".format(
+                 SQLAgent.supported_env_list
+             )
+             if cfg is None:
+                 cfg = supported_env_cfg[env_id]
+             else:
+                 assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
+         else:
+             assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
+             assert cfg.env.env_id in SQLAgent.supported_env_list, "Please use supported envs: {}".format(
+                 SQLAgent.supported_env_list
+             )
+         default_policy_config = EasyDict({"policy": SQLPolicy.default_config()})
+         default_policy_config.update(cfg)
+         cfg = default_policy_config
+
+         if exp_name is not None:
+             cfg.exp_name = exp_name
+         self.cfg = compile_config(cfg, policy=SQLPolicy)
+         self.exp_name = self.cfg.exp_name
+         if env is None:
+             self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
+         else:
+             assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
+             self.env = env
+
+         logging.getLogger().setLevel(logging.INFO)
+         self.seed = seed
+         set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
+         if not os.path.exists(self.exp_name):
+             os.makedirs(self.exp_name)
+         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
+         if model is None:
+             model = DQN(**self.cfg.policy.model)
+         self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
+         self.policy = SQLPolicy(self.cfg.policy, model=model)
+         if policy_state_dict is not None:
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
+
+     def train(
+         self,
+         step: int = int(1e7),
+         collector_env_num: int = None,
+         evaluator_env_num: int = None,
+         n_iter_save_ckpt: int = 1000,
+         context: Optional[str] = None,
+         debug: bool = False,
+         wandb_sweep: bool = False,
+     ) -> TrainingReturn:
+         """
+         Overview:
+             Train the agent with the SQL algorithm for ``step`` environment steps with ``collector_env_num`` \
+             collector environments and ``evaluator_env_num`` evaluator environments. Information during training \
+             will be recorded and saved by wandb.
+         Arguments:
+             - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
+             - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
+                 If not specified, it will be set according to the configuration.
+             - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                 If not specified, it will be set according to the configuration.
+             - n_iter_save_ckpt (:obj:`int`): The checkpoint saving frequency, in training iterations. \
+                 Default to 1000.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+             - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                 which is a hyper-parameter optimization process for seeking the best configurations. \
+                 Default to False. If True, the wandb sweep id will be used as the experiment name.
+         Returns:
+             - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                 - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+             logging.debug(self.policy._model)
+         # define env and policy
+         collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
+         evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
+         collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
+         evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
+
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 interaction_evaluator(
+                     self.cfg,
+                     self.policy.eval_mode,
+                     evaluator_env,
+                     render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
+                 )
+             )
+             task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
+             task.use(eps_greedy_handler(self.cfg))
+             task.use(
+                 StepCollector(
+                     self.cfg,
+                     self.policy.collect_mode,
+                     collector_env,
+                     random_collect_size=self.cfg.policy.random_collect_size
+                     if hasattr(self.cfg.policy, 'random_collect_size') else 0,
+                 )
+             )
+             if "nstep" in self.cfg.policy and self.cfg.policy.nstep > 1:
+                 task.use(nstep_reward_enhancer(self.cfg))
+             task.use(data_pusher(self.cfg, self.buffer_))
+             task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
+             task.use(
+                 wandb_online_logger(
+                     metric_list=self.policy._monitor_vars_learn(),
+                     model=self.policy._model,
+                     anonymous=True,
+                     project_name=self.exp_name,
+                     wandb_sweep=wandb_sweep,
+                 )
+             )
+             task.use(termination_checker(max_env_step=step))
+             task.use(final_ctx_saver(name=self.exp_name))
+             task.run()
+
+         return TrainingReturn(wandb_url=task.ctx.wandb_url)
+
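The ``nstep_reward_enhancer`` wired in above (only when ``cfg.policy.nstep > 1``) turns one-step rewards into discounted n-step returns before the data reaches the learner. An illustrative sketch of the quantity being computed, not the middleware's actual implementation (``gamma`` and the reward values are made up):

    def nstep_return(rewards, gamma=0.99, nstep=3):
        # r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
        return sum((gamma ** i) * r for i, r in enumerate(rewards[:nstep]))

    print(nstep_return([1.0, 0.0, 2.0, 5.0]))  # only the first 3 rewards contribute
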
+     def deploy(
+         self,
+         enable_save_replay: bool = False,
+         concatenate_all_replay: bool = False,
+         replay_save_path: str = None,
+         seed: Optional[Union[int, List]] = None,
+         debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Deploy the agent with the SQL algorithm by interacting with the environment, during which the replay \
+             video can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
+         Arguments:
+             - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
+             - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                 Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                 If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                 the replay video of each episode will be saved separately.
+             - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
+                 If not specified, the video will be saved in ``exp_name/videos``.
+             - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                 Default to None. If not specified, ``self.seed`` will be used. \
+                 If ``seed`` is an integer, the agent will be deployed once. \
+                 If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env = self.env.clone(caller='evaluator')
+
+         if seed is not None and isinstance(seed, int):
+             seeds = [seed]
+         elif seed is not None and isinstance(seed, list):
+             seeds = seed
+         else:
+             seeds = [self.seed]
+
+         returns = []
+         images = []
+         if enable_save_replay:
+             replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
+             env.enable_save_replay(replay_path=replay_save_path)
+         else:
+             logging.warning('No video would be generated during the deploy.')
+             if concatenate_all_replay:
+                 logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
+                 concatenate_all_replay = False
+
+         def single_env_forward_wrapper(forward_fn, cuda=True):
+
+             forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward
+
+             def _forward(obs):
+                 # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
+                 obs = ttorch.as_tensor(obs).unsqueeze(0)
+                 if cuda and torch.cuda.is_available():
+                     obs = obs.cuda()
+                 action = forward_fn(obs)["action"]
+                 # squeeze means delete batch dim, i.e. (1, A) -> (A, )
+                 action = action.squeeze(0).detach().cpu().numpy()
+                 return action
+
+             return _forward
+
+         forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
+
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.reset()
+
+         for seed in seeds:
+             env.seed(seed, dynamic_seed=False)
+             return_ = 0.
+             step = 0
+             obs = env.reset()
+             if concatenate_all_replay:
+                 images.append(render(env)[None])
+             while True:
+                 action = forward_fn(obs)
+                 obs, rew, done, info = env.step(action)
+                 if concatenate_all_replay:
+                     images.append(render(env)[None])
+                 return_ += rew
+                 step += 1
+                 if done:
+                     break
+             logging.info(f'SQL deploy is finished, final episode return with {step} steps is: {return_}')
+             returns.append(return_)
+
+         env.close()
+
+         if concatenate_all_replay:
+             images = np.concatenate(images, axis=0)
+             import imageio
+             imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
+
+         return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
+
+     def collect_data(
+         self,
+         env_num: int = 8,
+         save_data_path: Optional[str] = None,
+         n_sample: Optional[int] = None,
+         n_episode: Optional[int] = None,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> None:
+         """
+         Overview:
+             Collect data with the SQL algorithm: ``n_sample`` transition samples are collected with ``env_num`` \
+             collector environments (episode-based collection via ``n_episode`` is not yet implemented). \
+             The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
+             ``exp_name/demo_data``.
+         Arguments:
+             - env_num (:obj:`int`): The number of collector environments. Default to 8.
+             - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
+                 If not specified, the data will be saved in ``exp_name/demo_data``.
+             - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
+                 If not specified, ``n_episode`` must be specified.
+             - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
+                 If not specified, ``n_sample`` must be specified.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         if n_episode is not None:
+             raise NotImplementedError
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.collector_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
+
+         if save_data_path is None:
+             save_data_path = os.path.join(self.exp_name, 'demo_data')
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 StepCollector(
+                     self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
+                 )
+             )
+             task.use(offline_data_saver(save_data_path, data_type='hdf5'))
+             task.run(max_step=1)
+         logging.info(
+             f'SQL collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
+         )
+
+     def batch_evaluate(
+         self,
+         env_num: int = 4,
+         n_evaluator_episode: int = 4,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Evaluate the agent with the SQL algorithm for ``n_evaluator_episode`` episodes with ``env_num`` \
+             evaluator environments. The evaluation result will be returned.
+             The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
+             multiple evaluator environments to evaluate the agent and get an average performance, while ``deploy`` \
+             will only create one evaluator environment to evaluate the agent and save the replay video.
+         Arguments:
+             - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
+             - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.evaluator_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
+
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.launch()
+         env.reset()
+
+         evaluate_cfg = self.cfg
+         evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
+             task.run(max_step=1)
+
+         return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
+
+     @property
+     def best(self) -> 'SQLAgent':
+         """
+         Overview:
+             Load the best model from the checkpoint directory; \
+             by default it is saved at ``exp_name/ckpt/eval.pth.tar``. \
+             The return value is the agent with the best model.
+         Returns:
+             - (:obj:`SQLAgent`): The agent with the best model.
+         Examples:
+             >>> agent = SQLAgent(env_id='LunarLander-v2')
+             >>> agent.train()
+             >>> agent = agent.best
+
+         .. note::
+             The best model is the model with the highest evaluation return. If this method is called, the current \
+             model will be replaced by the best model.
+         """
+
+         best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
+         # Load best model if it exists
+         if os.path.exists(best_model_file_path):
+             policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         return self
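A minimal usage sketch of the ``SQLAgent`` API above; the experiment name and step budget are illustrative. Unlike the continuous-control agents in this diff, exploration comes from the ``eps_greedy_handler`` middleware that ``train`` wires in before the collector:

    from ding.bonus.sql import SQLAgent

    agent = SQLAgent(env_id='LunarLander-v2', exp_name='sql-demo')
    agent.train(step=50000)
    result = agent.batch_evaluate(env_num=4, n_evaluator_episode=8)
    print(result.eval_value, result.eval_value_std)
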
DI-engine/ding/bonus/td3.py ADDED
@@ -0,0 +1,455 @@
+ from typing import Optional, Union, List
+ from ditk import logging
+ from easydict import EasyDict
+ import os
+ import numpy as np
+ import torch
+ import treetensor.torch as ttorch
+ from ding.framework import task, OnlineRLContext
+ from ding.framework.middleware import CkptSaver, \
+     wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \
+     OffPolicyLearner, final_ctx_saver
+ from ding.envs import BaseEnv
+ from ding.envs import setup_ding_env_manager
+ from ding.policy import TD3Policy
+ from ding.utils import set_pkg_seed
+ from ding.utils import get_env_fps, render
+ from ding.config import save_config_py, compile_config
+ from ding.model import ContinuousQAC
+ from ding.data import DequeBuffer
+ from ding.bonus.common import TrainingReturn, EvalReturn
+ from ding.config.example.TD3 import supported_env_cfg
+ from ding.config.example.TD3 import supported_env
+
+
+ class TD3Agent:
+     """
+     Overview:
+         Class of agent for training, evaluation and deployment of the reinforcement learning algorithm \
+         Twin Delayed Deep Deterministic Policy Gradient (TD3).
+         For more information about the system design of RL agent, please refer to \
+         <https://di-engine-docs.readthedocs.io/en/latest/03_system/agent.html>.
+     Interface:
+         ``__init__``, ``train``, ``deploy``, ``collect_data``, ``batch_evaluate``, ``best``
+     """
+     supported_env_list = list(supported_env_cfg.keys())
+     """
+     Overview:
+         List of supported envs.
+     Examples:
+         >>> from ding.bonus.td3 import TD3Agent
+         >>> print(TD3Agent.supported_env_list)
+     """
+
+     def __init__(
+         self,
+         env_id: str = None,
+         env: BaseEnv = None,
+         seed: int = 0,
+         exp_name: str = None,
+         model: Optional[torch.nn.Module] = None,
+         cfg: Optional[Union[EasyDict, dict]] = None,
+         policy_state_dict: str = None,
+     ) -> None:
+         """
+         Overview:
+             Initialize agent for TD3 algorithm.
+         Arguments:
+             - env_id (:obj:`str`): The environment id, which is a registered environment name in gym or gymnasium. \
+                 If ``env_id`` is not specified, ``env_id`` in ``cfg.env`` must be specified. \
+                 If ``env_id`` is specified, ``env_id`` in ``cfg.env`` will be ignored. \
+                 ``env_id`` should be one of the supported envs, which can be found in ``supported_env_list``.
+             - env (:obj:`BaseEnv`): The environment instance for training and evaluation. \
+                 If ``env`` is not specified, ``env_id`` or ``cfg.env.env_id`` must be specified. \
+                 ``env_id`` or ``cfg.env.env_id`` will be used to create the environment instance. \
+                 If ``env`` is specified, ``env_id`` and ``cfg.env.env_id`` will be ignored.
+             - seed (:obj:`int`): The random seed, which is set before running the program. \
+                 Default to 0.
+             - exp_name (:obj:`str`): The name of this experiment, which will be used to create the folder to save \
+                 log data. Default to None. If not specified, the folder name will be ``env_id``-``algorithm``.
+             - model (:obj:`torch.nn.Module`): The model of TD3 algorithm, which should be an instance of class \
+                 :class:`ding.model.ContinuousQAC`. \
+                 If not specified, a default model will be generated according to the configuration.
+             - cfg (:obj:`Union[EasyDict, dict]`): The configuration of TD3 algorithm, which is a dict. \
+                 Default to None. If not specified, the default configuration will be used. \
+                 The default configuration can be found in ``ding/config/example/TD3/gym_lunarlander_v2.py``.
+             - policy_state_dict (:obj:`str`): The path of a policy state dict saved by PyTorch in a local file. \
+                 If specified, the policy will be loaded from this file. Default to None.
+
+         .. note::
+             An RL agent instance can be initialized in two basic ways. \
+             For example, we have an environment with id ``LunarLanderContinuous-v2`` registered in gym, \
+             and we want to train an agent with the TD3 algorithm with default configuration. \
+             Then we can initialize the agent in the following ways:
+             >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+             or, we can specify the env_id in the configuration:
+             >>> cfg = {'env': {'env_id': 'LunarLanderContinuous-v2'}, 'policy': ...... }
+             >>> agent = TD3Agent(cfg=cfg)
+             There are also other arguments to specify the agent when initializing.
+             For example, if we want to specify the environment instance:
+             >>> env = CustomizedEnv('LunarLanderContinuous-v2')
+             >>> agent = TD3Agent(cfg=cfg, env=env)
+             or, if we want to specify the model:
+             >>> model = ContinuousQAC(**cfg.policy.model)
+             >>> agent = TD3Agent(cfg=cfg, model=model)
+             or, if we want to reload the policy from a saved policy state dict:
+             >>> agent = TD3Agent(cfg=cfg, policy_state_dict='LunarLanderContinuous-v2.pth.tar')
+             Make sure that the configuration is consistent with the saved policy state dict.
+         """
+
+         assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+
+         if cfg is not None and not isinstance(cfg, EasyDict):
+             cfg = EasyDict(cfg)
+
+         if env_id is not None:
+             assert env_id in TD3Agent.supported_env_list, "Please use supported envs: {}".format(
+                 TD3Agent.supported_env_list
+             )
+             if cfg is None:
+                 cfg = supported_env_cfg[env_id]
+             else:
+                 assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args."
+         else:
+             assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg."
+             assert cfg.env.env_id in TD3Agent.supported_env_list, "Please use supported envs: {}".format(
+                 TD3Agent.supported_env_list
+             )
+         default_policy_config = EasyDict({"policy": TD3Policy.default_config()})
+         default_policy_config.update(cfg)
+         cfg = default_policy_config
+
+         if exp_name is not None:
+             cfg.exp_name = exp_name
+         self.cfg = compile_config(cfg, policy=TD3Policy)
+         self.exp_name = self.cfg.exp_name
+         if env is None:
+             self.env = supported_env[cfg.env.env_id](cfg=cfg.env)
+         else:
+             assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type."
+             self.env = env
+
+         logging.getLogger().setLevel(logging.INFO)
+         self.seed = seed
+         set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda)
+         if not os.path.exists(self.exp_name):
+             os.makedirs(self.exp_name)
+         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
+         if model is None:
+             model = ContinuousQAC(**self.cfg.policy.model)
+         self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size)
+         self.policy = TD3Policy(self.cfg.policy, model=model)
+         if policy_state_dict is not None:
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt")
+
+     def train(
+         self,
+         step: int = int(1e7),
+         collector_env_num: int = None,
+         evaluator_env_num: int = None,
+         n_iter_save_ckpt: int = 1000,
+         context: Optional[str] = None,
+         debug: bool = False,
+         wandb_sweep: bool = False,
+     ) -> TrainingReturn:
+         """
+         Overview:
+             Train the agent with the TD3 algorithm for ``step`` environment steps with ``collector_env_num`` \
+             collector environments and ``evaluator_env_num`` evaluator environments. Information during training \
+             will be recorded and saved by wandb.
+         Arguments:
+             - step (:obj:`int`): The total training environment steps of all collector environments. Default to 1e7.
+             - collector_env_num (:obj:`int`): The collector environment number. Default to None. \
+                 If not specified, it will be set according to the configuration.
+             - evaluator_env_num (:obj:`int`): The evaluator environment number. Default to None. \
+                 If not specified, it will be set according to the configuration.
+             - n_iter_save_ckpt (:obj:`int`): The checkpoint saving frequency, in training iterations. \
+                 Default to 1000.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+             - wandb_sweep (:obj:`bool`): Whether to use wandb sweep, \
+                 which is a hyper-parameter optimization process for seeking the best configurations. \
+                 Default to False. If True, the wandb sweep id will be used as the experiment name.
+         Returns:
+             - (:obj:`TrainingReturn`): The training result, of which the attributes are:
+                 - wandb_url (:obj:`str`): The Weights & Biases (wandb) project url of the training experiment.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+             logging.debug(self.policy._model)
+         # define env and policy
+         collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num
+         evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num
+         collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector')
+         evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator')
+
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 interaction_evaluator(
+                     self.cfg,
+                     self.policy.eval_mode,
+                     evaluator_env,
+                     render=self.cfg.policy.eval.render if hasattr(self.cfg.policy.eval, "render") else False
+                 )
+             )
+             task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt))
+             task.use(
+                 StepCollector(
+                     self.cfg,
+                     self.policy.collect_mode,
+                     collector_env,
+                     random_collect_size=self.cfg.policy.random_collect_size
+                     if hasattr(self.cfg.policy, 'random_collect_size') else 0,
+                 )
+             )
+             task.use(data_pusher(self.cfg, self.buffer_))
+             task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_))
+             task.use(
+                 wandb_online_logger(
+                     metric_list=self.policy._monitor_vars_learn(),
+                     model=self.policy._model,
+                     anonymous=True,
+                     project_name=self.exp_name,
+                     wandb_sweep=wandb_sweep,
+                 )
+             )
+             task.use(termination_checker(max_env_step=step))
+             task.use(final_ctx_saver(name=self.exp_name))
+             task.run()
+
+         return TrainingReturn(wandb_url=task.ctx.wandb_url)
+
+     def deploy(
+         self,
+         enable_save_replay: bool = False,
+         concatenate_all_replay: bool = False,
+         replay_save_path: str = None,
+         seed: Optional[Union[int, List]] = None,
+         debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Deploy the agent with the TD3 algorithm by interacting with the environment, during which the replay \
+             video can be saved if ``enable_save_replay`` is True. The evaluation result will be returned.
+         Arguments:
+             - enable_save_replay (:obj:`bool`): Whether to save the replay video. Default to False.
+             - concatenate_all_replay (:obj:`bool`): Whether to concatenate all replay videos into one video. \
+                 Default to False. If ``enable_save_replay`` is False, this argument will be ignored. \
+                 If ``enable_save_replay`` is True and ``concatenate_all_replay`` is False, \
+                 the replay video of each episode will be saved separately.
+             - replay_save_path (:obj:`str`): The path to save the replay video. Default to None. \
+                 If not specified, the video will be saved in ``exp_name/videos``.
+             - seed (:obj:`Union[int, List]`): The random seed, which is set before running the program. \
+                 Default to None. If not specified, ``self.seed`` will be used. \
+                 If ``seed`` is an integer, the agent will be deployed once. \
+                 If ``seed`` is a list of integers, the agent will be deployed once for each seed in the list.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env = self.env.clone(caller='evaluator')
+
+         if seed is not None and isinstance(seed, int):
+             seeds = [seed]
+         elif seed is not None and isinstance(seed, list):
+             seeds = seed
+         else:
+             seeds = [self.seed]
+
+         returns = []
+         images = []
+         if enable_save_replay:
+             replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
+             env.enable_save_replay(replay_path=replay_save_path)
+         else:
+             logging.warning('No video would be generated during the deploy.')
+             if concatenate_all_replay:
+                 logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
+                 concatenate_all_replay = False
+
+         def single_env_forward_wrapper(forward_fn, cuda=True):
+
+             def _forward(obs):
+                 # unsqueeze means add batch dim, i.e. (O, ) -> (1, O)
+                 obs = ttorch.as_tensor(obs).unsqueeze(0)
+                 if cuda and torch.cuda.is_available():
+                     obs = obs.cuda()
+                 action = forward_fn(obs, mode='compute_actor')["action"]
+                 # squeeze means delete batch dim, i.e. (1, A) -> (A, )
+                 action = action.squeeze(0).detach().cpu().numpy()
+                 return action
+
+             return _forward
+
+         forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda)
+
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.reset()
+
+         for seed in seeds:
+             env.seed(seed, dynamic_seed=False)
+             return_ = 0.
+             step = 0
+             obs = env.reset()
+             if concatenate_all_replay:
+                 images.append(render(env)[None])
+             while True:
+                 action = forward_fn(obs)
+                 obs, rew, done, info = env.step(action)
+                 if concatenate_all_replay:
+                     images.append(render(env)[None])
+                 return_ += rew
+                 step += 1
+                 if done:
+                     break
+             logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}')
+             returns.append(return_)
+
+         env.close()
+
+         if concatenate_all_replay:
+             images = np.concatenate(images, axis=0)
+             import imageio
+             imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
+
+         return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
+
+     def collect_data(
+         self,
+         env_num: int = 8,
+         save_data_path: Optional[str] = None,
+         n_sample: Optional[int] = None,
+         n_episode: Optional[int] = None,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> None:
+         """
+         Overview:
+             Collect data with the TD3 algorithm: ``n_sample`` transition samples are collected with ``env_num`` \
+             collector environments (episode-based collection via ``n_episode`` is not yet implemented). \
+             The collected data will be saved in ``save_data_path`` if specified, otherwise it will be saved in \
+             ``exp_name/demo_data``.
+         Arguments:
+             - env_num (:obj:`int`): The number of collector environments. Default to 8.
+             - save_data_path (:obj:`str`): The path to save the collected data. Default to None. \
+                 If not specified, the data will be saved in ``exp_name/demo_data``.
+             - n_sample (:obj:`int`): The number of samples to collect. Default to None. \
+                 If not specified, ``n_episode`` must be specified.
+             - n_episode (:obj:`int`): The number of episodes to collect. Default to None. \
+                 If not specified, ``n_sample`` must be specified.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         if n_episode is not None:
+             raise NotImplementedError
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.collector_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector')
+
+         if save_data_path is None:
+             save_data_path = os.path.join(self.exp_name, 'demo_data')
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(
+                 StepCollector(
+                     self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size
+                 )
+             )
+             task.use(offline_data_saver(save_data_path, data_type='hdf5'))
+             task.run(max_step=1)
+         logging.info(
+             f'TD3 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`'
+         )
+
+     def batch_evaluate(
+         self,
+         env_num: int = 4,
+         n_evaluator_episode: int = 4,
+         context: Optional[str] = None,
+         debug: bool = False
+     ) -> EvalReturn:
+         """
+         Overview:
+             Evaluate the agent with the TD3 algorithm for ``n_evaluator_episode`` episodes with ``env_num`` \
+             evaluator environments. The evaluation result will be returned.
+             The difference between methods ``batch_evaluate`` and ``deploy`` is that ``batch_evaluate`` will create \
+             multiple evaluator environments to evaluate the agent and get an average performance, while ``deploy`` \
+             will only create one evaluator environment to evaluate the agent and save the replay video.
+         Arguments:
+             - env_num (:obj:`int`): The number of evaluator environments. Default to 4.
+             - n_evaluator_episode (:obj:`int`): The number of episodes to evaluate. Default to 4.
+             - context (:obj:`str`): The multi-process context of the environment manager. Default to None. \
+                 It can be specified as ``spawn``, ``fork`` or ``forkserver``.
+             - debug (:obj:`bool`): Whether to use debug mode in the environment manager. Default to False. \
+                 If set to True, base environment manager will be used for easy debugging. Otherwise, \
+                 subprocess environment manager will be used.
+         Returns:
+             - (:obj:`EvalReturn`): The evaluation result, of which the attributes are:
+                 - eval_value (:obj:`np.float32`): The mean of evaluation return.
+                 - eval_value_std (:obj:`np.float32`): The standard deviation of evaluation return.
+         """
+
+         if debug:
+             logging.getLogger().setLevel(logging.DEBUG)
+         # define env and policy
+         env_num = env_num if env_num else self.cfg.env.evaluator_env_num
+         env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator')
+
+         # reset first to make sure the env is in the initial state
+         # env will be reset again in the main loop
+         env.launch()
+         env.reset()
+
+         evaluate_cfg = self.cfg
+         evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode
+
+         # main execution task
+         with task.start(ctx=OnlineRLContext()):
+             task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env))
+             task.run(max_step=1)
+
+         return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std)
+
+     @property
+     def best(self) -> 'TD3Agent':
+         """
+         Overview:
+             Load the best model from the checkpoint directory; \
+             by default it is saved at ``exp_name/ckpt/eval.pth.tar``. \
+             The return value is the agent with the best model.
+         Returns:
+             - (:obj:`TD3Agent`): The agent with the best model.
+         Examples:
+             >>> agent = TD3Agent(env_id='LunarLanderContinuous-v2')
+             >>> agent.train()
+             >>> agent = agent.best
+
+         .. note::
+             The best model is the model with the highest evaluation return. If this method is called, the current \
+             model will be replaced by the best model.
+         """
+
+         best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar")
+         # Load best model if it exists
+         if os.path.exists(best_model_file_path):
+             policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu"))
+             self.policy.learn_mode.load_state_dict(policy_state_dict)
+         return self
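A minimal sketch of multi-seed deployment with the ``TD3Agent`` API above; the experiment name, step budget and seed list are illustrative. Passing a list of seeds runs one episode per seed and aggregates them into ``EvalReturn``:

    from ding.bonus.td3 import TD3Agent

    agent = TD3Agent(env_id='LunarLanderContinuous-v2', exp_name='td3-demo')
    agent.train(step=100000)
    result = agent.best.deploy(seed=[0, 1, 2, 3])
    print(result.eval_value, result.eval_value_std)  # mean/std over the four seeds
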
DI-engine/ding/compatibility.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+
+
+ def torch_ge_131():
+     # Whether the installed torch version is >= 1.3.1, compared by joining the
+     # digits of ``torch.__version__`` into one integer.
+     return int("".join(list(filter(str.isdigit, torch.__version__)))) >= 131
+
+
+ def torch_ge_180():
+     # Whether the installed torch version is >= 1.8.0, using the same digit-join trick.
+     return int("".join(list(filter(str.isdigit, torch.__version__)))) >= 180
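The digit-join comparison above works for common release strings but can mis-order versions: e.g. ``"1.2.10"`` becomes ``1210 >= 131`` although ``1.2.10 < 1.3.1``. A hedged alternative sketch (not DI-engine code) using tuple comparison over the numeric components:

    def torch_version_ge(target: str, version: str) -> bool:
        # "1.13.1+cu117" -> (1, 13, 1); the local suffix after '+' is ignored
        def as_tuple(v: str) -> tuple:
            return tuple(int(x) for x in v.split('+')[0].split('.')[:3] if x.isdigit())
        return as_tuple(version) >= as_tuple(target)

    assert torch_version_ge("1.3.1", "1.13.1+cu117")
    assert not torch_version_ge("1.3.1", "1.2.10")
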
DI-engine/ding/config/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .config import Config, read_config, save_config, compile_config, compile_config_parallel, read_config_directly, \
+     read_config_with_system, save_config_py
+ from .utils import parallel_transform, parallel_transform_slurm
+ from .example import A2C, C51, DDPG, DQN, PG, PPOF, PPOOffPolicy, SAC, SQL, TD3
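A minimal sketch of how these exports are combined in practice, mirroring the bonus agents earlier in this diff; the env id key is an assumption (check ``supported_env_cfg.keys()``) and the experiment name is illustrative:

    import os
    from ding.config import compile_config, save_config_py
    from ding.config.example.SQL import supported_env_cfg
    from ding.policy import SQLPolicy

    cfg = supported_env_cfg['LunarLander-v2']  # assumed key
    cfg.exp_name = 'sql-demo'
    cfg = compile_config(cfg, policy=SQLPolicy)
    os.makedirs(cfg.exp_name, exist_ok=True)
    save_config_py(cfg, os.path.join(cfg.exp_name, 'policy_config.py'))
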
DI-engine/ding/config/config.py ADDED
@@ -0,0 +1,579 @@
1
+ import os
2
+ import os.path as osp
3
+ import yaml
4
+ import json
5
+ import shutil
6
+ import sys
7
+ import time
8
+ import tempfile
9
+ import subprocess
10
+ import datetime
11
+ from importlib import import_module
12
+ from typing import Optional, Tuple
13
+ from easydict import EasyDict
14
+ from copy import deepcopy
15
+
16
+ from ding.utils import deep_merge_dicts, get_rank
17
+ from ding.envs import get_env_cls, get_env_manager_cls, BaseEnvManager
18
+ from ding.policy import get_policy_cls
19
+ from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, Coordinator, \
20
+ AdvancedReplayBuffer, get_parallel_commander_cls, get_parallel_collector_cls, get_buffer_cls, \
21
+ get_serial_collector_cls, MetricSerialEvaluator, BattleInteractionSerialEvaluator
22
+ from ding.reward_model import get_reward_model_cls
23
+ from ding.world_model import get_world_model_cls
24
+ from .utils import parallel_transform, parallel_transform_slurm, parallel_transform_k8s, save_config_formatted
25
+
26
+
27
+ class Config(object):
28
+ r"""
29
+ Overview:
30
+ Base class for config.
31
+ Interface:
32
+ __init__, file_to_dict
33
+ Property:
34
+ cfg_dict
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ cfg_dict: Optional[dict] = None,
40
+ cfg_text: Optional[str] = None,
41
+ filename: Optional[str] = None
42
+ ) -> None:
43
+ """
44
+ Overview:
45
+ Init method. Create config including dict type config and text type config.
46
+ Arguments:
47
+ - cfg_dict (:obj:`Optional[dict]`): dict type config
48
+ - cfg_text (:obj:`Optional[str]`): text type config
49
+ - filename (:obj:`Optional[str]`): config file name
50
+ """
51
+ if cfg_dict is None:
52
+ cfg_dict = {}
53
+ if not isinstance(cfg_dict, dict):
54
+ raise TypeError("invalid type for cfg_dict: {}".format(type(cfg_dict)))
55
+ self._cfg_dict = cfg_dict
56
+ if cfg_text:
57
+ text = cfg_text
58
+ elif filename:
59
+ with open(filename, 'r') as f:
60
+ text = f.read()
61
+ else:
62
+ text = '.'
63
+ self._text = text
64
+ self._filename = filename
65
+
66
+ @staticmethod
67
+ def file_to_dict(filename: str) -> 'Config': # noqa
68
+ """
69
+ Overview:
70
+ Read config file and create config.
71
+ Arguments:
72
+ - filename (:obj:`str`): config file name.
73
+ Returns:
74
+ - cfg_dict (:obj:`Config`): config class
75
+ """
76
+ cfg_dict, cfg_text = Config._file_to_dict(filename)
77
+ return Config(cfg_dict, cfg_text, filename=filename)
78
+
79
+ @staticmethod
80
+ def _file_to_dict(filename: str) -> Tuple[dict, str]:
81
+ """
82
+ Overview:
83
+ Read config file and convert the config file to dict type config and text type config.
84
+ Arguments:
85
+ - filename (:obj:`str`): config file name.
86
+ Returns:
87
+ - cfg_dict (:obj:`Optional[dict]`): dict type config
88
+ - cfg_text (:obj:`Optional[str]`): text type config
89
+ """
90
+ filename = osp.abspath(osp.expanduser(filename))
91
+ # TODO check exist
92
+ # TODO check suffix
93
+ ext_name = osp.splitext(filename)[-1]
94
+ with tempfile.TemporaryDirectory() as temp_config_dir:
95
+ temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=ext_name)
96
+ temp_config_name = osp.basename(temp_config_file.name)
97
+ temp_config_file.close()
98
+ shutil.copyfile(filename, temp_config_file.name)
99
+
100
+ temp_module_name = osp.splitext(temp_config_name)[0]
101
+ sys.path.insert(0, temp_config_dir)
102
+ # TODO validate py syntax
103
+ module = import_module(temp_module_name)
104
+ cfg_dict = {k: v for k, v in module.__dict__.items() if not k.startswith('_')}
105
+ del sys.modules[temp_module_name]
106
+ sys.path.pop(0)
107
+
108
+ cfg_text = filename + '\n'
109
+ with open(filename, 'r') as f:
110
+ cfg_text += f.read()
111
+
112
+ return cfg_dict, cfg_text
113
+
114
+ @property
115
+ def cfg_dict(self) -> dict:
116
+ return self._cfg_dict
117
+
118
+
119
+ def read_config_yaml(path: str) -> EasyDict:
120
+ """
121
+ Overview:
122
+ read configuration from path
123
+ Arguments:
124
+ - path (:obj:`str`): Path of source yaml
125
+ Returns:
126
+ - (:obj:`EasyDict`): Config data from this file with dict type
127
+ """
128
+ with open(path, "r") as f:
129
+ config_ = yaml.safe_load(f)
130
+
131
+ return EasyDict(config_)
132
+
133
+
134
+ def save_config_yaml(config_: dict, path: str) -> None:
135
+ """
136
+ Overview:
137
+ save configuration to path
138
+ Arguments:
139
+ - config (:obj:`dict`): Config dict
140
+ - path (:obj:`str`): Path of target yaml
141
+ """
142
+ config_string = json.dumps(config_)
143
+ with open(path, "w") as f:
144
+ yaml.safe_dump(json.loads(config_string), f)
145
+
146
+
147
+ def save_config_py(config_: dict, path: str) -> None:
148
+ """
149
+ Overview:
150
+ save configuration to python file
151
+ Arguments:
152
+ - config (:obj:`dict`): Config dict
153
+ - path (:obj:`str`): Path of target python file
154
+ """
155
+ # config_string = json.dumps(config_, indent=4)
156
+ config_string = str(config_)
157
+ from yapf.yapflib.yapf_api import FormatCode
158
+ config_string, _ = FormatCode(config_string)
159
+ config_string = config_string.replace('inf,', 'float("inf"),')
160
+ with open(path, "w") as f:
161
+ f.write('exp_config = ' + config_string)
162
+
163
+
164
+ def read_config_directly(path: str) -> dict:
165
+ """
166
+ Overview:
167
+ Read configuration from a file path (currently only Python files are supported) and return the result directly.
168
+ Arguments:
169
+ - path (:obj:`str`): Path of configuration file
170
+ Returns:
171
+ - cfg (:obj:`dict`): Configuration dict.
172
+ """
173
+ suffix = path.split('.')[-1]
174
+ if suffix == 'py':
175
+ return Config.file_to_dict(path).cfg_dict
176
+ else:
177
+ raise KeyError("invalid config file suffix: {}".format(suffix))
178
+
179
+
180
+ def read_config(path: str) -> Tuple[dict, dict]:
181
+ """
182
+ Overview:
183
+ Read configuration from a file path (currently only Python files are supported) and split it into the proper parts.
184
+ Arguments:
185
+ - path (:obj:`str`): Path of configuration file
186
+ Returns:
187
+ - cfg (:obj:`Tuple[dict, dict]`): A tuple of two configuration dicts, divided into `main_config` and \
188
+ `create_config`.
189
+ """
190
+ suffix = path.split('.')[-1]
191
+ if suffix == 'py':
192
+ cfg = Config.file_to_dict(path).cfg_dict
193
+ assert "main_config" in cfg, "Please make sure a 'main_config' variable is declared in config python file!"
194
+ assert "create_config" in cfg, "Please make sure a 'create_config' variable is declared in config python file!"
195
+ return cfg['main_config'], cfg['create_config']
196
+ else:
197
+ raise KeyError("invalid config file suffix: {}".format(suffix))
198
+
199
+
200
+ def read_config_with_system(path: str) -> Tuple[dict, dict, dict]:
201
+ """
202
+ Overview:
203
+ Read configuration from a file path (currently only Python files are supported) and split it into the proper parts.
204
+ Arguments:
205
+ - path (:obj:`str`): Path of configuration file
206
+ Returns:
207
+ - cfg (:obj:`Tuple[dict, dict, dict]`): A tuple of three configuration dicts, divided into `main_config`, \
208
+ `create_config` and `system_config`.
209
+ """
210
+ suffix = path.split('.')[-1]
211
+ if suffix == 'py':
212
+ cfg = Config.file_to_dict(path).cfg_dict
213
+ assert "main_config" in cfg, "Please make sure a 'main_config' variable is declared in config python file!"
214
+ assert "create_config" in cfg, "Please make sure a 'create_config' variable is declared in config python file!"
215
+ assert "system_config" in cfg, "Please make sure a 'system_config' variable is declared in config python file!"
216
+ return cfg['main_config'], cfg['create_config'], cfg['system_config']
217
+ else:
218
+ raise KeyError("invalid config file suffix: {}".format(suffix))
219
+
220
+
221
+ def save_config(config_: dict, path: str, type_: str = 'py', save_formatted: bool = False) -> None:
222
+ """
223
+ Overview:
224
+ save configuration to python file or yaml file
225
+ Arguments:
226
+ - config (:obj:`dict`): Config dict
227
+ - path (:obj:`str`): Path of target yaml or target python file
228
+ - type (:obj:`str`): If type is ``yaml`` , save configuration to yaml file. If type is ``py`` , save\
229
+ configuration to python file.
230
+ - save_formatted (:obj:`bool`): If save_formatted is true, save formatted config to path.\
231
+ Formatted config can be read by serial_pipeline directly.
232
+ """
233
+ assert type_ in ['yaml', 'py'], type_
234
+ if type_ == 'yaml':
235
+ save_config_yaml(config_, path)
236
+ elif type_ == 'py':
237
+ save_config_py(config_, path)
238
+ if save_formatted:
239
+ formatted_path = osp.join(osp.dirname(path), 'formatted_' + osp.basename(path))
240
+ save_config_formatted(config_, formatted_path)
241
+
242
+
243
+ def compile_buffer_config(policy_cfg: EasyDict, user_cfg: EasyDict, buffer_cls: 'IBuffer') -> EasyDict: # noqa
244
+
245
+ def _compile_buffer_config(policy_buffer_cfg, user_buffer_cfg, buffer_cls):
246
+
247
+ if buffer_cls is None:
248
+ assert 'type' in policy_buffer_cfg, "please indicate buffer type in create_cfg"
249
+ buffer_cls = get_buffer_cls(policy_buffer_cfg)
250
+ buffer_cfg = deep_merge_dicts(buffer_cls.default_config(), policy_buffer_cfg)
251
+ buffer_cfg = deep_merge_dicts(buffer_cfg, user_buffer_cfg)
252
+ return buffer_cfg
253
+
254
+ policy_multi_buffer = policy_cfg.other.replay_buffer.get('multi_buffer', False)
255
+ user_multi_buffer = user_cfg.policy.get('other', {}).get('replay_buffer', {}).get('multi_buffer', False)
256
+ assert not user_multi_buffer or user_multi_buffer == policy_multi_buffer, "For multi_buffer, \
257
+ user_cfg({}) and policy_cfg({}) must be in accordance".format(user_multi_buffer, policy_multi_buffer)
258
+ multi_buffer = policy_multi_buffer
259
+ if not multi_buffer:
260
+ policy_buffer_cfg = policy_cfg.other.replay_buffer
261
+ user_buffer_cfg = user_cfg.policy.get('other', {}).get('replay_buffer', {})
262
+ return _compile_buffer_config(policy_buffer_cfg, user_buffer_cfg, buffer_cls)
263
+ else:
264
+ return_cfg = EasyDict()
265
+ for buffer_name in policy_cfg.other.replay_buffer: # Only traverse keys in policy_cfg
266
+ if buffer_name == 'multi_buffer':
267
+ continue
268
+ policy_buffer_cfg = policy_cfg.other.replay_buffer[buffer_name]
269
+ user_buffer_cfg = user_cfg.policy.get('other', {}).get('replay_buffer', {}).get(buffer_name, {})  # look up by the buffer_name variable, not a literal string
270
+ if buffer_cls is None:
271
+ return_cfg[buffer_name] = _compile_buffer_config(policy_buffer_cfg, user_buffer_cfg, None)
272
+ else:
273
+ return_cfg[buffer_name] = _compile_buffer_config(
274
+ policy_buffer_cfg, user_buffer_cfg, buffer_cls[buffer_name]
275
+ )
276
+ return_cfg[buffer_name].name = buffer_name
277
+ return return_cfg
278
+
279
+
280
+ def compile_collector_config(
281
+ policy_cfg: EasyDict,
282
+ user_cfg: EasyDict,
283
+ collector_cls: 'ISerialCollector' # noqa
284
+ ) -> EasyDict:
285
+ policy_collector_cfg = policy_cfg.collect.collector
286
+ user_collector_cfg = user_cfg.policy.get('collect', {}).get('collector', {})
287
+ # step1: get collector class
288
+ # two sources: the create_cfg already merged into policy_cfg, or an explicitly passed collector class; the class has higher priority
289
+ if collector_cls is None:
290
+ assert 'type' in policy_collector_cfg, "please indicate collector type in create_cfg"
291
+ # use type to get collector_cls
292
+ collector_cls = get_serial_collector_cls(policy_collector_cfg)
293
+ # step2: policy collector cfg merge to collector cfg
294
+ collector_cfg = deep_merge_dicts(collector_cls.default_config(), policy_collector_cfg)
295
+ # step3: user collector cfg merge to the step2 config
296
+ collector_cfg = deep_merge_dicts(collector_cfg, user_collector_cfg)
297
+
298
+ return collector_cfg
299
+
300
+
301
+ policy_config_template = dict(
302
+ model=dict(),
303
+ learn=dict(learner=dict()),
304
+ collect=dict(collector=dict()),
305
+ eval=dict(evaluator=dict()),
306
+ other=dict(replay_buffer=dict()),
307
+ )
308
+ policy_config_template = EasyDict(policy_config_template)
309
+ env_config_template = dict(manager=dict(), stop_value=int(1e10), n_evaluator_episode=4)
310
+ env_config_template = EasyDict(env_config_template)
311
+
312
+
313
+ def save_project_state(exp_name: str) -> None:
314
+
315
+ def _fn(cmd: str):
316
+ return subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.strip().decode("utf-8")
317
+
318
+ if subprocess.run("git status", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).returncode == 0:
319
+ short_sha = _fn("git describe --always")
320
+ log = _fn("git log --stat -n 5")
321
+ diff = _fn("git diff")
322
+ with open(os.path.join(exp_name, "git_log.txt"), "w", encoding='utf-8') as f:
323
+ f.write(short_sha + '\n\n' + log)
324
+ with open(os.path.join(exp_name, "git_diff.txt"), "w", encoding='utf-8') as f:
325
+ f.write(diff)
326
+
327
+
328
+ def compile_config(
329
+ cfg: EasyDict,
330
+ env_manager: type = None,
331
+ policy: type = None,
332
+ learner: type = BaseLearner,
333
+ collector: type = None,
334
+ evaluator: type = InteractionSerialEvaluator,
335
+ buffer: type = None,
336
+ env: type = None,
337
+ reward_model: type = None,
338
+ world_model: type = None,
339
+ seed: int = 0,
340
+ auto: bool = False,
341
+ create_cfg: dict = None,
342
+ save_cfg: bool = True,
343
+ save_path: str = 'total_config.py',
344
+ renew_dir: bool = True,
345
+ ) -> EasyDict:
346
+ """
347
+ Overview:
348
+ Combine the input config with the other given components, and compile it into a complete
349
+ config that can be consumed directly by the following pipeline.
350
+ Arguments:
351
+ - cfg (:obj:`EasyDict`): Input config dict which is to be used in the following pipeline
352
+ - env_manager (:obj:`type`): Env_manager class which is to be used in the following pipeline
353
+ - policy (:obj:`type`): Policy class which is to be used in the following pipeline
354
+ - learner (:obj:`type`): Input learner class, defaults to BaseLearner
355
+ - collector (:obj:`type`): Input collector class, defaults to BaseSerialCollector
356
+ - evaluator (:obj:`type`): Input evaluator class, defaults to InteractionSerialEvaluator
357
+ - buffer (:obj:`type`): Input buffer class, defaults to IBuffer
358
+ - env (:obj:`type`): Environment class which is to be used in the following pipeline
359
+ - reward_model (:obj:`type`): Reward model class which provides an additional learned or shaped reward
360
+ - seed (:obj:`int`): Random number seed
361
+ - auto (:obj:`bool`): Compile create_config dict or not
362
+ - create_cfg (:obj:`dict`): Input create config dict
363
+ - save_cfg (:obj:`bool`): Save config or not
364
+ - save_path (:obj:`str`): Path of saving file
365
+ - renew_dir (:obj:`bool`): Whether to create a new directory for saving the config.
366
+ Returns:
367
+ - cfg (:obj:`EasyDict`): Config after compiling
368
+ """
369
+ cfg, create_cfg = deepcopy(cfg), deepcopy(create_cfg)
370
+ if auto:
371
+ assert create_cfg is not None
372
+ # for compatibility
373
+ if 'collector' not in create_cfg:
374
+ create_cfg.collector = EasyDict(dict(type='sample'))
375
+ if 'replay_buffer' not in create_cfg:
376
+ create_cfg.replay_buffer = EasyDict(dict(type='advanced'))
377
+ buffer = AdvancedReplayBuffer
378
+ if env is None:
379
+ if 'env' in create_cfg:
380
+ env = get_env_cls(create_cfg.env)
381
+ else:
382
+ env = None
383
+ create_cfg.env = {'type': 'ding_env_wrapper_generated'}
384
+ if env_manager is None:
385
+ env_manager = get_env_manager_cls(create_cfg.env_manager)
386
+ if policy is None:
387
+ policy = get_policy_cls(create_cfg.policy)
388
+ if 'default_config' in dir(env):
389
+ env_config = env.default_config()
390
+ else:
391
+ env_config = EasyDict() # env does not have default_config
392
+ env_config = deep_merge_dicts(env_config_template, env_config)
393
+ env_config.update(create_cfg.env)
394
+ env_config.manager = deep_merge_dicts(env_manager.default_config(), env_config.manager)
395
+ env_config.manager.update(create_cfg.env_manager)
396
+ policy_config = policy.default_config()
397
+ policy_config = deep_merge_dicts(policy_config_template, policy_config)
398
+ policy_config.update(create_cfg.policy)
399
+ policy_config.collect.collector.update(create_cfg.collector)
400
+ if 'evaluator' in create_cfg:
401
+ policy_config.eval.evaluator.update(create_cfg.evaluator)
402
+ policy_config.other.replay_buffer.update(create_cfg.replay_buffer)
403
+
404
+ policy_config.other.commander = BaseSerialCommander.default_config()
405
+ if 'reward_model' in create_cfg:
406
+ reward_model = get_reward_model_cls(create_cfg.reward_model)
407
+ reward_model_config = reward_model.default_config()
408
+ else:
409
+ reward_model_config = EasyDict()
410
+ if 'world_model' in create_cfg:
411
+ world_model = get_world_model_cls(create_cfg.world_model)
412
+ world_model_config = world_model.default_config()
413
+ world_model_config.update(create_cfg.world_model)
414
+ else:
415
+ world_model_config = EasyDict()
416
+ else:
417
+ if 'default_config' in dir(env):
418
+ env_config = env.default_config()
419
+ else:
420
+ env_config = EasyDict() # env does not have default_config
421
+ env_config = deep_merge_dicts(env_config_template, env_config)
422
+ if env_manager is None:
423
+ env_manager = BaseEnvManager # for compatibility
424
+ env_config.manager = deep_merge_dicts(env_manager.default_config(), env_config.manager)
425
+ policy_config = policy.default_config()
426
+ policy_config = deep_merge_dicts(policy_config_template, policy_config)
427
+ if reward_model is None:
428
+ reward_model_config = EasyDict()
429
+ else:
430
+ reward_model_config = reward_model.default_config()
431
+ if world_model is None:
432
+ world_model_config = EasyDict()
433
+ else:
434
+ world_model_config = world_model.default_config()
435
+ world_model_config.update(create_cfg.world_model if create_cfg is not None and 'world_model' in create_cfg else {})  # guard: create_cfg may be absent in this branch
436
+ policy_config.learn.learner = deep_merge_dicts(
437
+ learner.default_config(),
438
+ policy_config.learn.learner,
439
+ )
440
+ if create_cfg is not None or collector is not None:
441
+ policy_config.collect.collector = compile_collector_config(policy_config, cfg, collector)
442
+ if evaluator:
443
+ policy_config.eval.evaluator = deep_merge_dicts(
444
+ evaluator.default_config(),
445
+ policy_config.eval.evaluator,
446
+ )
447
+ if create_cfg is not None or buffer is not None:
448
+ policy_config.other.replay_buffer = compile_buffer_config(policy_config, cfg, buffer)
449
+ default_config = EasyDict({'env': env_config, 'policy': policy_config})
450
+ if len(reward_model_config) > 0:
451
+ default_config['reward_model'] = reward_model_config
452
+ if len(world_model_config) > 0:
453
+ default_config['world_model'] = world_model_config
454
+ cfg = deep_merge_dicts(default_config, cfg)
455
+ if 'unroll_len' in cfg.policy:
456
+ cfg.policy.collect.unroll_len = cfg.policy.unroll_len
457
+ cfg.seed = seed
458
+ # check important key in config
459
+ if evaluator in [InteractionSerialEvaluator, BattleInteractionSerialEvaluator]: # env interaction evaluation
460
+ cfg.policy.eval.evaluator.stop_value = cfg.env.stop_value
461
+ cfg.policy.eval.evaluator.n_episode = cfg.env.n_evaluator_episode
462
+ if 'exp_name' not in cfg:
463
+ cfg.exp_name = 'default_experiment'
464
+ if save_cfg and get_rank() == 0:
465
+ if os.path.exists(cfg.exp_name) and renew_dir:
466
+ cfg.exp_name += datetime.datetime.now().strftime("_%y%m%d_%H%M%S")
467
+ try:
468
+ os.makedirs(cfg.exp_name)
469
+ except FileExistsError:
470
+ pass
471
+ save_project_state(cfg.exp_name)
472
+ save_path = os.path.join(cfg.exp_name, save_path)
473
+ save_config(cfg, save_path, save_formatted=True)
474
+ return cfg
475
+
476
+
477
+ def compile_config_parallel(
478
+ cfg: EasyDict,
479
+ create_cfg: EasyDict,
480
+ system_cfg: EasyDict,
481
+ seed: int = 0,
482
+ save_cfg: bool = True,
483
+ save_path: str = 'total_config.py',
484
+ platform: str = 'local',
485
+ coordinator_host: Optional[str] = None,
486
+ learner_host: Optional[str] = None,
487
+ collector_host: Optional[str] = None,
488
+ coordinator_port: Optional[int] = None,
489
+ learner_port: Optional[int] = None,
490
+ collector_port: Optional[int] = None,
491
+ ) -> EasyDict:
492
+ """
493
+ Overview:
494
+ Combine the input parallel-mode config with the other given information, and compile it into a\
495
+ complete config that can be consumed directly by other programs.
496
+ Arguments:
497
+ - cfg (:obj:`EasyDict`): Input main config dict
498
+ - create_cfg (:obj:`dict`): Input create config dict, including type parameters, such as environment type
499
+ - system_cfg (:obj:`dict`): Input system config dict, including system parameters, such as file path,\
500
+ communication mode, use multiple GPUs or not
501
+ - seed (:obj:`int`): Random number seed
502
+ - save_cfg (:obj:`bool`): Save config or not
503
+ - save_path (:obj:`str`): Path of saving file
504
+ - platform (:obj:`str`): Where to run the program, 'local', 'slurm' or 'k8s'
505
+ - coordinator_host (:obj:`Optional[str]`): Input coordinator's host when platform is slurm
506
+ - learner_host (:obj:`Optional[str]`): Input learner's host when platform is slurm
507
+ - collector_host (:obj:`Optional[str]`): Input collector's host when platform is slurm
508
+ Returns:
509
+ - cfg (:obj:`EasyDict`): Config after compiling
510
+ """
511
+ # for compatibility
512
+ if 'replay_buffer' not in create_cfg:
513
+ create_cfg.replay_buffer = EasyDict(dict(type='advanced'))
514
+ # env
515
+ env = get_env_cls(create_cfg.env)
516
+ if 'default_config' in dir(env):
517
+ env_config = env.default_config()
518
+ else:
519
+ env_config = EasyDict() # env does not have default_config
520
+ env_config = deep_merge_dicts(env_config_template, env_config)
521
+ env_config.update(create_cfg.env)
522
+
523
+ env_manager = get_env_manager_cls(create_cfg.env_manager)
524
+ env_config.manager = env_manager.default_config()
525
+ env_config.manager.update(create_cfg.env_manager)
526
+
527
+ # policy
528
+ policy = get_policy_cls(create_cfg.policy)
529
+ policy_config = policy.default_config()
530
+ policy_config = deep_merge_dicts(policy_config_template, policy_config)
531
+ cfg.policy.update(create_cfg.policy)
532
+
533
+ collector = get_parallel_collector_cls(create_cfg.collector)
534
+ policy_config.collect.collector = collector.default_config()
535
+ policy_config.collect.collector.update(create_cfg.collector)
536
+ policy_config.learn.learner = BaseLearner.default_config()
537
+ policy_config.learn.learner.update(create_cfg.learner)
538
+ commander = get_parallel_commander_cls(create_cfg.commander)
539
+ policy_config.other.commander = commander.default_config()
540
+ policy_config.other.commander.update(create_cfg.commander)
541
+ policy_config.other.replay_buffer.update(create_cfg.replay_buffer)
542
+ policy_config.other.replay_buffer = compile_buffer_config(policy_config, cfg, None)
543
+
544
+ default_config = EasyDict({'env': env_config, 'policy': policy_config})
545
+ cfg = deep_merge_dicts(default_config, cfg)
546
+
547
+ cfg.policy.other.commander.path_policy = system_cfg.path_policy # league may use 'path_policy'
548
+
549
+ # system
550
+ for k in ['comm_learner', 'comm_collector']:
551
+ system_cfg[k] = create_cfg[k]
552
+ if platform == 'local':
553
+ cfg = parallel_transform(EasyDict({'main': cfg, 'system': system_cfg}))
554
+ elif platform == 'slurm':
555
+ cfg = parallel_transform_slurm(
556
+ EasyDict({
557
+ 'main': cfg,
558
+ 'system': system_cfg
559
+ }), coordinator_host, learner_host, collector_host
560
+ )
561
+ elif platform == 'k8s':
562
+ cfg = parallel_transform_k8s(
563
+ EasyDict({
564
+ 'main': cfg,
565
+ 'system': system_cfg
566
+ }),
567
+ coordinator_port=coordinator_port,
568
+ learner_port=learner_port,
569
+ collector_port=collector_port
570
+ )
571
+ else:
572
+ raise KeyError("not support platform type: {}".format(platform))
573
+ cfg.system.coordinator = deep_merge_dicts(Coordinator.default_config(), cfg.system.coordinator)
574
+ # seed
575
+ cfg.seed = seed
576
+
577
+ if save_cfg:
578
+ save_config(cfg, save_path)
579
+ return cfg
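
Taken together, a typical serial-pipeline entry point compiles a user config roughly as below (a sketch; the config path is illustrative, and the env/policy/collector classes are resolved from the registered types in `create_config`):

    from ding.config import read_config, compile_config

    main_config, create_config = read_config('cartpole_dqn_config.py')  # illustrative path
    cfg = compile_config(
        main_config,
        seed=0,
        auto=True,                 # infer env/policy/collector classes from create_config
        create_cfg=create_config,
        save_cfg=True,             # writes total_config.py and formatted_total_config.py under cfg.exp_name
    )

With save_cfg=True on rank 0, compile_config also snapshots the current git state (git_log.txt / git_diff.txt) into the experiment directory via save_project_state.
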
DI-engine/ding/config/example/A2C/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ from easydict import EasyDict
2
+ from . import gym_bipedalwalker_v3
3
+ from . import gym_lunarlander_v2
4
+
5
+ supported_env_cfg = {
6
+ gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg,
7
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg,
8
+ }
9
+
10
+ supported_env_cfg = EasyDict(supported_env_cfg)
11
+
12
+ supported_env = {
13
+ gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env,
14
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env,
15
+ }
16
+
17
+ supported_env = EasyDict(supported_env)
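
These registries are keyed by gym `env_id`, so the tuned config and the matching env factory resolve with a single lookup (a sketch):

    from ding.config.example.A2C import supported_env_cfg, supported_env

    cfg = supported_env_cfg['LunarLander-v2']  # tuned A2C hyper-parameters for this env
    env_fn = supported_env['LunarLander-v2']   # matching env factory (ding.envs.gym_env.env)
    print(cfg.policy.learn.learning_rate)      # 3e-4, per the LunarLander config file below
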
DI-engine/ding/config/example/A2C/gym_bipedalwalker_v3.py ADDED
@@ -0,0 +1,43 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='Bipedalwalker-v3-A2C',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='BipedalWalker-v3',
9
+ collector_env_num=8,
10
+ evaluator_env_num=8,
11
+ n_evaluator_episode=8,
12
+ act_scale=True,
13
+ rew_clip=True,
14
+ ),
15
+ policy=dict(
16
+ cuda=True,
17
+ action_space='continuous',
18
+ model=dict(
19
+ action_space='continuous',
20
+ obs_shape=24,
21
+ action_shape=4,
22
+ ),
23
+ learn=dict(
24
+ batch_size=64,
25
+ learning_rate=0.0003,
26
+ value_weight=0.7,
27
+ entropy_weight=0.0005,
28
+ discount_factor=0.99,
29
+ adv_norm=True,
30
+ ),
31
+ collect=dict(
32
+ n_sample=64,
33
+ discount_factor=0.99,
34
+ ),
35
+ ),
36
+ wandb_logger=dict(
37
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
38
+ ),
39
+ )
40
+
41
+ cfg = EasyDict(cfg)
42
+
43
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/A2C/gym_lunarlander_v2.py ADDED
@@ -0,0 +1,38 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='LunarLander-v2-A2C',
6
+ env=dict(
7
+ collector_env_num=8,
8
+ evaluator_env_num=8,
9
+ env_id='LunarLander-v2',
10
+ n_evaluator_episode=8,
11
+ stop_value=260,
12
+ ),
13
+ policy=dict(
14
+ cuda=True,
15
+ model=dict(
16
+ obs_shape=8,
17
+ action_shape=4,
18
+ ),
19
+ learn=dict(
20
+ batch_size=64,
21
+ learning_rate=3e-4,
22
+ entropy_weight=0.001,
23
+ adv_norm=True,
24
+ ),
25
+ collect=dict(
26
+ n_sample=64,
27
+ discount_factor=0.99,
28
+ gae_lambda=0.95,
29
+ ),
30
+ ),
31
+ wandb_logger=dict(
32
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
33
+ ),
34
+ )
35
+
36
+ cfg = EasyDict(cfg)
37
+
38
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/C51/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ from easydict import EasyDict
2
+ from . import gym_lunarlander_v2
3
+ from . import gym_pongnoframeskip_v4
4
+ from . import gym_qbertnoframeskip_v4
5
+ from . import gym_spaceInvadersnoframeskip_v4
6
+
7
+ supported_env_cfg = {
8
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg,
9
+ gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg,
10
+ gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg,
11
+ gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg,
12
+ }
13
+
14
+ supported_env_cfg = EasyDict(supported_env_cfg)
15
+
16
+ supported_env = {
17
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env,
18
+ gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env,
19
+ gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env,
20
+ gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env,
21
+ }
22
+
23
+ supported_env = EasyDict(supported_env)
DI-engine/ding/config/example/C51/gym_lunarlander_v2.py ADDED
@@ -0,0 +1,52 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='lunarlander_c51',
6
+ seed=0,
7
+ env=dict(
8
+ collector_env_num=8,
9
+ evaluator_env_num=8,
10
+ env_id='LunarLander-v2',
11
+ n_evaluator_episode=8,
12
+ stop_value=260,
13
+ ),
14
+ policy=dict(
15
+ cuda=False,
16
+ model=dict(
17
+ obs_shape=8,
18
+ action_shape=4,
19
+ encoder_hidden_size_list=[512, 64],
20
+ v_min=-30,
21
+ v_max=30,
22
+ n_atom=51,
23
+ ),
24
+ discount_factor=0.99,
25
+ nstep=3,
26
+ learn=dict(
27
+ update_per_collect=10,
28
+ batch_size=64,
29
+ learning_rate=0.001,
30
+ target_update_freq=100,
31
+ ),
32
+ collect=dict(
33
+ n_sample=64,
34
+ unroll_len=1,
35
+ ),
36
+ other=dict(
37
+ eps=dict(
38
+ type='exp',
39
+ start=0.95,
40
+ end=0.1,
41
+ decay=50000,
42
+ ), replay_buffer=dict(replay_buffer_size=100000, )
43
+ ),
44
+ ),
45
+ wandb_logger=dict(
46
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
47
+ ),
48
+ )
49
+
50
+ cfg = EasyDict(cfg)
51
+
52
+ env = ding.envs.gym_env.env
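
The `eps` block in the config above specifies an exponential epsilon-greedy anneal; assuming the usual form for `type='exp'` (end + (start - end) * exp(-step / decay), a sketch rather than DI-engine's internals), the schedule for these values looks like:

    import math

    def eps_exp(step, start=0.95, end=0.1, decay=50000):
        # exponential anneal from `start` toward `end` with time constant `decay`
        return end + (start - end) * math.exp(-step / decay)

    print(round(eps_exp(0), 3))       # 0.95
    print(round(eps_exp(50000), 3))   # ~0.413 (one time constant)
    print(round(eps_exp(200000), 3))  # ~0.116, approaching the 0.1 floor
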
DI-engine/ding/config/example/C51/gym_pongnoframeskip_v4.py ADDED
@@ -0,0 +1,54 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='PongNoFrameskip-v4-C51',
6
+ seed=0,
7
+ env=dict(
8
+ collector_env_num=8,
9
+ evaluator_env_num=8,
10
+ n_evaluator_episode=8,
11
+ stop_value=30,
12
+ env_id='PongNoFrameskip-v4',
13
+ frame_stack=4,
14
+ env_wrapper='atari_default',
15
+ ),
16
+ policy=dict(
17
+ cuda=True,
18
+ priority=False,
19
+ model=dict(
20
+ obs_shape=[4, 84, 84],
21
+ action_shape=6,
22
+ encoder_hidden_size_list=[128, 128, 512],
23
+ v_min=-10,
24
+ v_max=10,
25
+ n_atom=51,
26
+ ),
27
+ nstep=3,
28
+ discount_factor=0.99,
29
+ learn=dict(
30
+ update_per_collect=10,
31
+ batch_size=32,
32
+ learning_rate=0.0001,
33
+ target_update_freq=500,
34
+ ),
35
+ collect=dict(n_sample=100, ),
36
+ eval=dict(evaluator=dict(eval_freq=4000, )),
37
+ other=dict(
38
+ eps=dict(
39
+ type='exp',
40
+ start=1.,
41
+ end=0.05,
42
+ decay=250000,
43
+ ),
44
+ replay_buffer=dict(replay_buffer_size=100000, ),
45
+ ),
46
+ ),
47
+ wandb_logger=dict(
48
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
49
+ ),
50
+ )
51
+
52
+ cfg = EasyDict(cfg)
53
+
54
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/C51/gym_qbertnoframeskip_v4.py ADDED
@@ -0,0 +1,54 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='QbertNoFrameskip-v4-C51',
6
+ seed=0,
7
+ env=dict(
8
+ collector_env_num=8,
9
+ evaluator_env_num=8,
10
+ n_evaluator_episode=8,
11
+ stop_value=30000,
12
+ env_id='QbertNoFrameskip-v4',
13
+ frame_stack=4,
14
+ env_wrapper='atari_default',
15
+ ),
16
+ policy=dict(
17
+ cuda=True,
18
+ priority=True,
19
+ model=dict(
20
+ obs_shape=[4, 84, 84],
21
+ action_shape=6,
22
+ encoder_hidden_size_list=[128, 128, 512],
23
+ v_min=-10,
24
+ v_max=10,
25
+ n_atom=51,
26
+ ),
27
+ nstep=3,
28
+ discount_factor=0.99,
29
+ learn=dict(
30
+ update_per_collect=10,
31
+ batch_size=32,
32
+ learning_rate=0.0001,
33
+ target_update_freq=500,
34
+ ),
35
+ collect=dict(n_sample=100, ),
36
+ eval=dict(evaluator=dict(eval_freq=4000, )),
37
+ other=dict(
38
+ eps=dict(
39
+ type='exp',
40
+ start=1.,
41
+ end=0.05,
42
+ decay=1000000,
43
+ ),
44
+ replay_buffer=dict(replay_buffer_size=400000, ),
45
+ ),
46
+ ),
47
+ wandb_logger=dict(
48
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
49
+ ),
50
+ )
51
+
52
+ cfg = EasyDict(cfg)
53
+
54
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/C51/gym_spaceInvadersnoframeskip_v4.py ADDED
@@ -0,0 +1,54 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='SpaceInvadersNoFrameskip-v4-C51',
6
+ seed=0,
7
+ env=dict(
8
+ collector_env_num=8,
9
+ evaluator_env_num=8,
10
+ n_evaluator_episode=8,
11
+ stop_value=10000000000,
12
+ env_id='SpaceInvadersNoFrameskip-v4',
13
+ frame_stack=4,
14
+ env_wrapper='atari_default',
15
+ ),
16
+ policy=dict(
17
+ cuda=True,
18
+ priority=False,
19
+ model=dict(
20
+ obs_shape=[4, 84, 84],
21
+ action_shape=6,
22
+ encoder_hidden_size_list=[128, 128, 512],
23
+ v_min=-10,
24
+ v_max=10,
25
+ n_atom=51,
26
+ ),
27
+ nstep=3,
28
+ discount_factor=0.99,
29
+ learn=dict(
30
+ update_per_collect=10,
31
+ batch_size=32,
32
+ learning_rate=0.0001,
33
+ target_update_freq=500,
34
+ ),
35
+ collect=dict(n_sample=100, ),
36
+ eval=dict(evaluator=dict(eval_freq=4000, )),
37
+ other=dict(
38
+ eps=dict(
39
+ type='exp',
40
+ start=1.,
41
+ end=0.05,
42
+ decay=1000000,
43
+ ),
44
+ replay_buffer=dict(replay_buffer_size=400000, ),
45
+ ),
46
+ ),
47
+ wandb_logger=dict(
48
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
49
+ ),
50
+ )
51
+
52
+ cfg = EasyDict(cfg)
53
+
54
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/DDPG/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ from easydict import EasyDict
2
+ from . import gym_bipedalwalker_v3
3
+ from . import gym_halfcheetah_v3
4
+ from . import gym_hopper_v3
5
+ from . import gym_lunarlandercontinuous_v2
6
+ from . import gym_pendulum_v1
7
+ from . import gym_walker2d_v3
8
+
9
+ supported_env_cfg = {
10
+ gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg,
11
+ gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg,
12
+ gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg,
13
+ gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg,
14
+ gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg,
15
+ gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg,
16
+ }
17
+
18
+ supported_env_cfg = EasyDict(supported_env_cfg)
19
+
20
+ supported_env = {
21
+ gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env,
22
+ gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env,
23
+ gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env,
24
+ gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env,
25
+ gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env,
26
+ gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env,
27
+ }
28
+
29
+ supported_env = EasyDict(supported_env)
DI-engine/ding/config/example/DDPG/gym_bipedalwalker_v3.py ADDED
@@ -0,0 +1,45 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='Bipedalwalker-v3-DDPG',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='BipedalWalker-v3',
9
+ collector_env_num=8,
10
+ evaluator_env_num=5,
11
+ n_evaluator_episode=5,
12
+ act_scale=True,
13
+ rew_clip=True,
14
+ ),
15
+ policy=dict(
16
+ cuda=True,
17
+ random_collect_size=10000,
18
+ model=dict(
19
+ obs_shape=24,
20
+ action_shape=4,
21
+ twin_critic=False,
22
+ action_space='regression',
23
+ actor_head_hidden_size=400,
24
+ critic_head_hidden_size=400,
25
+ ),
26
+ learn=dict(
27
+ update_per_collect=64,
28
+ batch_size=256,
29
+ learning_rate_actor=0.0003,
30
+ learning_rate_critic=0.0003,
31
+ target_theta=0.005,
32
+ discount_factor=0.99,
33
+ learner=dict(hook=dict(log_show_after_iter=1000, ))
34
+ ),
35
+ collect=dict(n_sample=64, ),
36
+ other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ),
37
+ ),
38
+ wandb_logger=dict(
39
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
40
+ ),
41
+ )
42
+
43
+ cfg = EasyDict(cfg)
44
+
45
+ env = ding.envs.gym_env.env
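
`target_theta=0.005` in the config above controls the soft (Polyak) target-network update used by DDPG; a minimal sketch of that rule in its standard form (not DI-engine's exact implementation):

    import torch

    def soft_update(target: torch.nn.Module, source: torch.nn.Module, theta: float = 0.005):
        # target <- theta * source + (1 - theta) * target, applied parameter-wise
        with torch.no_grad():
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.mul_(1.0 - theta).add_(sp, alpha=theta)
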
DI-engine/ding/config/example/DDPG/gym_halfcheetah_v3.py ADDED
@@ -0,0 +1,53 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='HalfCheetah-v3-DDPG',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='HalfCheetah-v3',
9
+ norm_obs=dict(use_norm=False, ),
10
+ norm_reward=dict(use_norm=False, ),
11
+ collector_env_num=1,
12
+ evaluator_env_num=8,
13
+ n_evaluator_episode=8,
14
+ stop_value=11000,
15
+ env_wrapper='mujoco_default',
16
+ ),
17
+ policy=dict(
18
+ cuda=True,
19
+ random_collect_size=25000,
20
+ model=dict(
21
+ obs_shape=17,
22
+ action_shape=6,
23
+ twin_critic=False,
24
+ actor_head_hidden_size=256,
25
+ critic_head_hidden_size=256,
26
+ action_space='regression',
27
+ ),
28
+ learn=dict(
29
+ update_per_collect=1,
30
+ batch_size=256,
31
+ learning_rate_actor=1e-3,
32
+ learning_rate_critic=1e-3,
33
+ ignore_done=True,
34
+ target_theta=0.005,
35
+ discount_factor=0.99,
36
+ actor_update_freq=1,
37
+ noise=False,
38
+ ),
39
+ collect=dict(
40
+ n_sample=1,
41
+ unroll_len=1,
42
+ noise_sigma=0.1,
43
+ ),
44
+ other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ),
45
+ ),
46
+ wandb_logger=dict(
47
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
48
+ ),
49
+ )
50
+
51
+ cfg = EasyDict(cfg)
52
+
53
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/DDPG/gym_hopper_v3.py ADDED
@@ -0,0 +1,53 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='Hopper-v3-DDPG',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='Hopper-v3',
9
+ norm_obs=dict(use_norm=False, ),
10
+ norm_reward=dict(use_norm=False, ),
11
+ collector_env_num=1,
12
+ evaluator_env_num=8,
13
+ n_evaluator_episode=8,
14
+ stop_value=6000,
15
+ env_wrapper='mujoco_default',
16
+ ),
17
+ policy=dict(
18
+ cuda=True,
19
+ random_collect_size=25000,
20
+ model=dict(
21
+ obs_shape=11,
22
+ action_shape=3,
23
+ twin_critic=False,
24
+ actor_head_hidden_size=256,
25
+ critic_head_hidden_size=256,
26
+ action_space='regression',
27
+ ),
28
+ learn=dict(
29
+ update_per_collect=1,
30
+ batch_size=256,
31
+ learning_rate_actor=1e-3,
32
+ learning_rate_critic=1e-3,
33
+ ignore_done=False,
34
+ target_theta=0.005,
35
+ discount_factor=0.99,
36
+ actor_update_freq=1,
37
+ noise=False,
38
+ ),
39
+ collect=dict(
40
+ n_sample=1,
41
+ unroll_len=1,
42
+ noise_sigma=0.1,
43
+ ),
44
+ other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ),
45
+ ),
46
+ wandb_logger=dict(
47
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
48
+ ),
49
+ )
50
+
51
+ cfg = EasyDict(cfg)
52
+
53
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/DDPG/gym_lunarlandercontinuous_v2.py ADDED
@@ -0,0 +1,60 @@
1
+ from easydict import EasyDict
2
+ from functools import partial
3
+ import ding.envs.gym_env
4
+
5
+ cfg = dict(
6
+ exp_name='LunarLanderContinuous-V2-DDPG',
7
+ seed=0,
8
+ env=dict(
9
+ env_id='LunarLanderContinuous-v2',
10
+ collector_env_num=8,
11
+ evaluator_env_num=8,
12
+ n_evaluator_episode=8,
13
+ stop_value=260,
14
+ act_scale=True,
15
+ ),
16
+ policy=dict(
17
+ cuda=True,
18
+ random_collect_size=0,
19
+ model=dict(
20
+ obs_shape=8,
21
+ action_shape=2,
22
+ twin_critic=True,
23
+ action_space='regression',
24
+ ),
25
+ learn=dict(
26
+ update_per_collect=2,
27
+ batch_size=128,
28
+ learning_rate_actor=0.001,
29
+ learning_rate_critic=0.001,
30
+ ignore_done=False, # TODO(pu)
31
+ # (int) When critic network updates once, how many times will actor network update.
32
+ # Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
33
+ # Default 1 for DDPG, 2 for TD3.
34
+ actor_update_freq=1,
35
+ # (bool) Whether to add noise on target network's action.
36
+ # Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
37
+ # Default True for TD3, False for DDPG.
38
+ noise=False,
39
+ noise_sigma=0.1,
40
+ noise_range=dict(
41
+ min=-0.5,
42
+ max=0.5,
43
+ ),
44
+ ),
45
+ collect=dict(
46
+ n_sample=48,
47
+ noise_sigma=0.1,
48
+ collector=dict(collect_print_freq=1000, ),
49
+ ),
50
+ eval=dict(evaluator=dict(eval_freq=100, ), ),
51
+ other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ),
52
+ ),
53
+ wandb_logger=dict(
54
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
55
+ ),
56
+ )
57
+
58
+ cfg = EasyDict(cfg)
59
+
60
+ env = partial(ding.envs.gym_env.env, continuous=True)
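
Unlike the other entries, this env factory pre-binds continuous=True through functools.partial; the binding can be inspected before the factory is used (a sketch):

    from ding.config.example.DDPG.gym_lunarlandercontinuous_v2 import env

    print(env.func)      # ding.envs.gym_env.env
    print(env.keywords)  # {'continuous': True}
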
DI-engine/ding/config/example/DDPG/gym_pendulum_v1.py ADDED
@@ -0,0 +1,52 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='Pendulum-v1-DDPG',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='Pendulum-v1',
9
+ collector_env_num=8,
10
+ evaluator_env_num=5,
11
+ n_evaluator_episode=5,
12
+ stop_value=-250,
13
+ act_scale=True,
14
+ ),
15
+ policy=dict(
16
+ cuda=False,
17
+ priority=False,
18
+ random_collect_size=800,
19
+ model=dict(
20
+ obs_shape=3,
21
+ action_shape=1,
22
+ twin_critic=False,
23
+ action_space='regression',
24
+ ),
25
+ learn=dict(
26
+ update_per_collect=2,
27
+ batch_size=128,
28
+ learning_rate_actor=0.001,
29
+ learning_rate_critic=0.001,
30
+ ignore_done=True,
31
+ actor_update_freq=1,
32
+ noise=False,
33
+ ),
34
+ collect=dict(
35
+ n_sample=48,
36
+ noise_sigma=0.1,
37
+ collector=dict(collect_print_freq=1000, ),
38
+ ),
39
+ eval=dict(evaluator=dict(eval_freq=100, )),
40
+ other=dict(replay_buffer=dict(
41
+ replay_buffer_size=20000,
42
+ max_use=16,
43
+ ), ),
44
+ ),
45
+ wandb_logger=dict(
46
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
47
+ ),
48
+ )
49
+
50
+ cfg = EasyDict(cfg)
51
+
52
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/DDPG/gym_walker2d_v3.py ADDED
@@ -0,0 +1,53 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='Walker2d-v3-DDPG',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='Walker2d-v3',
9
+ norm_obs=dict(use_norm=False, ),
10
+ norm_reward=dict(use_norm=False, ),
11
+ collector_env_num=1,
12
+ evaluator_env_num=8,
13
+ n_evaluator_episode=8,
14
+ stop_value=6000,
15
+ env_wrapper='mujoco_default',
16
+ ),
17
+ policy=dict(
18
+ cuda=True,
19
+ random_collect_size=25000,
20
+ model=dict(
21
+ obs_shape=17,
22
+ action_shape=6,
23
+ twin_critic=False,
24
+ actor_head_hidden_size=256,
25
+ critic_head_hidden_size=256,
26
+ action_space='regression',
27
+ ),
28
+ learn=dict(
29
+ update_per_collect=1,
30
+ batch_size=256,
31
+ learning_rate_actor=1e-3,
32
+ learning_rate_critic=1e-3,
33
+ ignore_done=False,
34
+ target_theta=0.005,
35
+ discount_factor=0.99,
36
+ actor_update_freq=1,
37
+ noise=False,
38
+ ),
39
+ collect=dict(
40
+ n_sample=1,
41
+ unroll_len=1,
42
+ noise_sigma=0.1,
43
+ ),
44
+ other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ),
45
+ ),
46
+ wandb_logger=dict(
47
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
48
+ ),
49
+ )
50
+
51
+ cfg = EasyDict(cfg)
52
+
53
+ env = ding.envs.gym_env.env
DI-engine/ding/config/example/DQN/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ from easydict import EasyDict
2
+ from . import gym_lunarlander_v2
3
+ from . import gym_pongnoframeskip_v4
4
+ from . import gym_qbertnoframeskip_v4
5
+ from . import gym_spaceInvadersnoframeskip_v4
6
+
7
+ supported_env_cfg = {
8
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg,
9
+ gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg,
10
+ gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg,
11
+ gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg,
12
+ }
13
+
14
+ supported_env_cfg = EasyDict(supported_env_cfg)
15
+
16
+ supported_env = {
17
+ gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env,
18
+ gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env,
19
+ gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env,
20
+ gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env,
21
+ }
22
+
23
+ supported_env = EasyDict(supported_env)
DI-engine/ding/config/example/DQN/gym_lunarlander_v2.py ADDED
@@ -0,0 +1,53 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='LunarLander-v2-DQN',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='LunarLander-v2',
9
+ collector_env_num=8,
10
+ evaluator_env_num=8,
11
+ n_evaluator_episode=8,
12
+ stop_value=260,
13
+ ),
14
+ policy=dict(
15
+ cuda=True,
16
+ random_collect_size=25000,
17
+ discount_factor=0.99,
18
+ nstep=3,
19
+ learn=dict(
20
+ update_per_collect=10,
21
+ batch_size=64,
22
+ learning_rate=0.001,
23
+ # Frequency of target network update.
24
+ target_update_freq=100,
25
+ ),
26
+ model=dict(
27
+ obs_shape=8,
28
+ action_shape=4,
29
+ encoder_hidden_size_list=[512, 64],
30
+ # Whether to use dueling head.
31
+ dueling=True,
32
+ ),
33
+ collect=dict(
34
+ n_sample=64,
35
+ unroll_len=1,
36
+ ),
37
+ other=dict(
38
+ eps=dict(
39
+ type='exp',
40
+ start=0.95,
41
+ end=0.1,
42
+ decay=50000,
43
+ ), replay_buffer=dict(replay_buffer_size=100000, )
44
+ ),
45
+ ),
46
+ wandb_logger=dict(
47
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
48
+ ),
49
+ )
50
+
51
+ cfg = EasyDict(cfg)
52
+
53
+ env = ding.envs.gym_env.env
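
These example configs feed the high-level agents in ding.bonus; a sketch of that path, assuming the DQNAgent interface from ding/bonus/dqn.py (the step budget here is illustrative):

    from ding.bonus import DQNAgent

    agent = DQNAgent(env_id='LunarLander-v2', exp_name='LunarLander-v2-DQN')  # resolves the config above by env_id
    agent.train(step=int(1e6))  # illustrative training budget
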
DI-engine/ding/config/example/DQN/gym_pongnoframeskip_v4.py ADDED
@@ -0,0 +1,50 @@
1
+ from easydict import EasyDict
2
+ import ding.envs.gym_env
3
+
4
+ cfg = dict(
5
+ exp_name='PongNoFrameskip-v4-DQN',
6
+ seed=0,
7
+ env=dict(
8
+ env_id='PongNoFrameskip-v4',
9
+ collector_env_num=8,
10
+ evaluator_env_num=8,
11
+ n_evaluator_episode=8,
12
+ stop_value=30,
13
+ frame_stack=4,
14
+ env_wrapper='atari_default',
15
+ ),
16
+ policy=dict(
17
+ cuda=True,
18
+ priority=False,
19
+ discount_factor=0.99,
20
+ nstep=3,
21
+ learn=dict(
22
+ update_per_collect=10,
23
+ batch_size=32,
24
+ learning_rate=0.0001,
25
+ # Frequency of target network update.
26
+ target_update_freq=500,
27
+ ),
28
+ model=dict(
29
+ obs_shape=[4, 84, 84],
30
+ action_shape=6,
31
+ encoder_hidden_size_list=[128, 128, 512],
32
+ ),
33
+ collect=dict(n_sample=96, ),
34
+ other=dict(
35
+ eps=dict(
36
+ type='exp',
37
+ start=1.,
38
+ end=0.05,
39
+ decay=250000,
40
+ ), replay_buffer=dict(replay_buffer_size=100000, )
41
+ ),
42
+ ),
43
+ wandb_logger=dict(
44
+ gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False
45
+ ),
46
+ )
47
+
48
+ cfg = EasyDict(cfg)
49
+
50
+ env = ding.envs.gym_env.env