diff --git a/Fig/gpu/animation.key b/Fig/gpu/animation.key new file mode 100755 index 0000000..61dbaca Binary files /dev/null and b/Fig/gpu/animation.key differ diff --git a/Fig/gpu/animation.png b/Fig/gpu/animation.png new file mode 100644 index 0000000..8cf7d7e Binary files /dev/null and b/Fig/gpu/animation.png differ diff --git a/Fig/gpu/gpu-driver-role.key b/Fig/gpu/gpu-driver-role.key index 1a3ae07..c5200fa 100755 Binary files a/Fig/gpu/gpu-driver-role.key and b/Fig/gpu/gpu-driver-role.key differ diff --git a/Fig/gpu/gpu-driver-role.png b/Fig/gpu/gpu-driver-role.png index 5a856ea..9ef7cbd 100644 Binary files a/Fig/gpu/gpu-driver-role.png and b/Fig/gpu/gpu-driver-role.png differ diff --git a/Fig/gpu/graphic-sw-stack.gv b/Fig/gpu/graphic-sw-stack.gv new file mode 100644 index 0000000..3f70862 --- /dev/null +++ b/Fig/gpu/graphic-sw-stack.gv @@ -0,0 +1,17 @@ +digraph G { + rankdir=LR; + + compound=true; + node [shape=record]; + subgraph cluster_cpu { + label = "CPU (Client)"; + CPU_SW [label=" 3D Model | JAVA | JOGL | { OpenGL API | Shaders \n (builtin-functions)} | Driver"]; + } + subgraph cluster_gpu { + label = "GPU HW (Server)" + GPU_SW [label=" 3D Rendering-pipeline \ndescribed in next section"]; + } + CPU_SW:f1 -> GPU_SW:f1 [label=" Frame data, \n shader-executable-code"]; + + label = "Graphic SW Stack"; +} diff --git a/Fig/gpu/opengl-flow.gv b/Fig/gpu/opengl-flow.gv new file mode 100644 index 0000000..59645d0 --- /dev/null +++ b/Fig/gpu/opengl-flow.gv @@ -0,0 +1,35 @@ +digraph G { + rankdir=LR; + + compound=true; + node [shape=record]; + subgraph cluster_3d { + label = "3D/2D modeling software"; + CodeGen [label="code-gen"]; + subgraph cluster_code { + label = "Generated Code"; + Api [label=" OpenGL API | Shaders"]; + } + Hand [label="hand-modifying"]; + } + subgraph cluster_driver { + label = "Driver" + Compiler [label="On-line Compiler"]; + Obj [label="obj"]; + Linker [label="On-line binding (Linker)"]; + Exe [label="exe"]; + } + CodeGen -> Api [lhead 
="cluster_code"]; + Api -> Hand [ltail ="cluster_code"]; + Hand -> Api [lhead ="cluster_code"]; + Api:a -> Obj [lhead ="cluster_driver"]; + Api:s -> Compiler; + Compiler -> Obj; + Obj -> Linker; + Linker -> Exe; + Exe -> GPU; + Exe -> CPU [ltail ="cluster_driver"]; + + label = "OpenGL Flow"; +} + diff --git a/Fig/gpu/opengl_flow.odg b/Fig/gpu/opengl_flow.odg index 8341969..87a06ce 100644 Binary files a/Fig/gpu/opengl_flow.odg and b/Fig/gpu/opengl_flow.odg differ diff --git a/Fig/gpu/opengl_flow.png b/Fig/gpu/opengl_flow.png index dc56944..9e2868d 100644 Binary files a/Fig/gpu/opengl_flow.png and b/Fig/gpu/opengl_flow.png differ diff --git a/lbdex/clean.sh b/lbdex/clean.sh index aabb4c1..63433e9 100644 --- a/lbdex/clean.sh +++ b/lbdex/clean.sh @@ -6,5 +6,5 @@ popd pushd verilog make clean popd -rm -rf chapters preprocess tmp.txt +rm -rf output chapters preprocess tmp.txt diff --git a/lbdex/gen-docs-ref.sh b/lbdex/gen-docs-ref.sh index 72f819c..27d5ee7 100644 --- a/lbdex/gen-docs-ref.sh +++ b/lbdex/gen-docs-ref.sh @@ -2,6 +2,7 @@ pushd ./lbdex bash ./gen-chapters.sh -bash ./gen-ref-output.sh +# disable since removing output files of llvm-ir and asm from source/*.rst +#bash ./gen-ref-output.sh popd diff --git a/source/about.rst b/source/about.rst index 73e1019..93b9a9b 100644 --- a/source/about.rst +++ b/source/about.rst @@ -63,11 +63,15 @@ Revision history Version 12.0.14, not released yet. -Version 12.0.13.2, Released July 24, 2023. +Version 12.0.13.3, Released August 13, 2023. + + gpu.rst: animation, graphic-sw-stack.gv and opengl-flow.gv. + +Version 12.0.13.2, Released August 7, 2023. gpu.rst: Subsection of buffers, vao binding. -Version 12.0.13.1, Released August 7, 2023. +Version 12.0.13.1, Released July 24, 2023. gpu.rst: Section of Basic geometry in computer graphics, a x b = -b x a in 2D, The role of GPU driver. npu.rst: The role of GPU driver. 
diff --git a/source/c++.rst b/source/c++.rst index ce3c6f9..5d0aeb0 100644 --- a/source/c++.rst +++ b/source/c++.rst @@ -17,13 +17,24 @@ The Chapter11_2 can be built and run with the C++ polymorphism example code of ch12_inherit.cpp as follows, .. rubric:: lbdex/input/ch12_inherit.cpp -.. literalinclude:: ../lbdex/input/ch12_inherit.cpp - :start-after: /// start +.. code-block:: c++ + + ... + class CPolygon { // _ZTVN10__cxxabiv117__class_type_infoE for parent class + ... + #ifdef COUT_TEST + // generate IR invoke, landing, resume and unreachable on iMac + { cout << this->area() << endl; } + #else + { printf("%d\n", this->area()); } + #endif + }; + ... If using cout instead of printf in ch12_inherit.cpp, it won't generate exception handler IRs on Linux, whereas it will generate invoke, landing, resume and unreachable exception handler IRs on iMac. -Example code, ch12_eh.cpp, which supports **try** and **catch** exception handler +Example code, ch12_eh.cpp, which includes **try** and **catch** exception handler as the following will generate these exception handler IRs both on iMac and Linux. .. rubric:: lbdex/input/ch12_eh.cpp @@ -37,7 +48,26 @@ as the following will generate these exception handler IRs both on iMac and Linu JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/build/ bin/llvm-dis ch12_eh.bc -o - -.. literalinclude:: ../lbdex/output/ch12_eh.ll +.. rubric:: ../lbdex/output/ch12_eh.ll +.. code-block:: llvm + + ... + define dso_local i32 @_Z14test_try_catchv() #0 personality i8* bitcast (i32 (... + )* @__gxx_personality_v0 to i8*) { + entry: + ... + invoke void @_Z15throw_exceptionii(i32 signext 2, i32 signext 1) + to label %invoke.cont unwind label %lpad + + invoke.cont: ; preds = %entry + br label %try.cont + + lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } + catch i8* null + ... + } + ... .. code:: console @@ -67,7 +97,40 @@ exception C++ keywords. 
It can compile ch12_eh.bc as follows, JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/build/ bin/llc -march=cpu0 -relocation-model=static -filetype=asm ch12_eh.bc -o - -.. literalinclude:: ../lbdex/output/ch12_eh.cpu0.s +.. rubric:: ../lbdex/output/ch12_eh.cpu0.s +.. code:: text + + .type _Z14test_try_catchv,@function + .ent _Z14test_try_catchv # @_Z14test_try_catchv + _Z14test_try_catchv: + ... + $tmp0: + addiu $4, $zero, 2 + addiu $5, $zero, 1 + jsub _Z15throw_exceptionii + nop + $tmp1: + # %bb.1: # %invoke.cont + jmp $BB1_4 + $BB1_2: # %lpad + $tmp2: + st $4, 16($fp) + st $5, 12($fp) + # %bb.3: # %catch + ld $4, 16($fp) + jsub __cxa_begin_catch + nop + addiu $2, $zero, 1 + st $2, 20($fp) + jsub __cxa_end_catch + nop + jmp $BB1_5 + $BB1_4: # %try.cont + addiu $2, $zero, 0 + st $2, 20($fp) + $BB1_5: # %return + ld $2, 20($fp) + ... Thread variable @@ -237,33 +300,79 @@ programming. JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/build/ bin/llvm-dis ch12_thread_var.bc -o - -.. literalinclude:: ../lbdex/output/ch12_thread_var.ll +.. rubric:: ../lbdex/output/ch12_thread_var.ll +.. code-block:: llvm + + ... + @a = dso_local thread_local global i32 0, align 4 + @b = dso_local thread_local global i32 0, align 4 + + ; Function Attrs: noinline nounwind optnone mustprogress + define dso_local i32 @_Z15test_thread_varv() #0 { + entry: + store i32 2, i32* @a, align 4 + %0 = load i32, i32* @a, align 4 + ret i32 %0 + } + + ; Function Attrs: noinline nounwind optnone mustprogress + define dso_local i32 @_Z17test_thread_var_2v() #0 { + entry: + store i32 3, i32* @b, align 4 + %0 = load i32, i32* @b, align 4 + ret i32 %0 + } + ... .. code-block:: console JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/build/ bin/llc -march=cpu0 -relocation-model=pic -filetype=asm ch12_thread_var.bc - -o - + -o ch12_thread_var.cpu0.pic.s + JonathantekiiMac:input Jonathan$ cat ch12_thread_var.cpu0.pic.s -.. 
literalinclude:: ../lbdex/output/ch12_thread_var.cpu0.pic.s +.. rubric:: ../lbdex/output/ch12_thread_var.cpu0.pic.s +.. code-block:: text + ... + .ent _Z15test_thread_varv # @_Z15test_thread_varv + _Z15test_thread_varv: + ... + ori $4, $gp, %tlsldm(a) + ld $t9, %call16(__tls_get_addr)($gp) + jalr $t9 + nop + ld $gp, 8($fp) + lui $3, %dtp_hi(a) + addu $2, $3, $2 + ori $2, $2, %dtp_lo(a) + ... In pic mode, the __thread variable access by call function __tls_get_addr with the address of thread variable. The c++11 standard thread_local variable is accessed by calling function _ZTW1b which also call the function __tls_get_addr to get the thread_local variable address. -In static mode, the thread variable is accessed by machine instructions as -follows, +In static mode, the thread variable is accessed by getting address of thread +variables "a" and "b" with machine instructions as follows, .. code-block:: console JonathantekiiMac:input Jonathan$ /Users/Jonathan/llvm/test/build/ bin/llc -march=cpu0 -relocation-model=static -filetype=asm - ch12_thread_var.bc -o - + ch12_thread_var.bc -o ch12_thread_var.cpu0.static.s + JonathantekiiMac:input Jonathan$ cat ch12_thread_var.cpu0.static.s -.. literalinclude:: ../lbdex/output/ch12_thread_var.cpu0.static.s +.. rubric:: ../lbdex/output/ch12_thread_var.cpu0.static.s +.. code-block:: text + ... + lui $2, %tp_hi(a) + ori $2, $2, %tp_lo(a) + ... + lui $2, %tp_hi(b) + ori $2, $2, %tp_lo(b) + ... While Mips uses rdhwr instruction to access thread varaible as below, Cpu0 access thread varaible without inventing any new instruction. diff --git a/source/conf.py b/source/conf.py index b1f7a6f..1f537cd 100644 --- a/source/conf.py +++ b/source/conf.py @@ -65,9 +65,9 @@ # built documents. # # The short X.Y version. -version = u'12.0.13.2' +version = u'12.0.13.3' # The full version, including alpha/beta/rc tags. -release = u'12.0.13.2' +release = u'12.0.13.3' # The language for content autogenerated by Sphinx. 
Refer to documentation # for a list of supported languages. diff --git a/source/doc.rst b/source/doc.rst index c0354cd..6c0b72a 100644 --- a/source/doc.rst +++ b/source/doc.rst @@ -458,7 +458,7 @@ set-llvm-lit % `diff -r origin modify &> set-llvm-lit.diff` .. [#rst] http://docutils.sourceforge.net/rst.html -.. [#llvm-sphinx-quick] http://llvm.org/docs/SphinxQuickstartTemplate.html +.. [#llvm-sphinx-quick] http://llvm.org/docs/SphinxQuickstartTemplate.html If you need to show LLVM IR use the llvm code block. https://llvm.org/docs/SphinxQuickstartTemplate.html#code-blocks .. [#sphinx-lexers] http://pygments.org/docs/lexers/ diff --git a/source/gpu.rst b/source/gpu.rst index fa415bb..6a3d681 100644 --- a/source/gpu.rst +++ b/source/gpu.rst @@ -16,42 +16,9 @@ Since the 2D or 3D graphic processing provides large opportunity in parallel data processing, GPU hardware usually composed of thousands of functional units in each core(grid) in N-Vidia processors. -The flow for 3D/2D graphic processing as the following diagram. - -.. _opengl_flow: -.. figure:: ../Fig/gpu/opengl_flow.png - :align: center - :scale: 100 % - - OpenGL flow - -The driver run on CPU side as :numref:`gpu_driver_role`. The OpenGL Api will call -driver's function eventually and driver finish the function's work via issuing -GPU-HW's command and/or sending data to GPU. GPU's firmware only manage clock, -voltage, power comsumption, ..., etc [#gpu-firmware-jobs]_. -Even so, GPU's rendor work from the data of 3D vertex, colors, ... sending from -CPU and storing in GPU's memory or shared memory consume more computing power -than CPU. - -.. _gpu_driver_role: -.. figure:: ../Fig/gpu/gpu-driver-role.png - :align: center - :scale: 50 % - - The role of GPU driver - -- As above, every animation the client CPU program set new position of obect - (vertices) and colors, the data of one frame, server (driver and GPU) does - the 3D to 2D rendering. 
Higher-level - libraries and frameworks on top of OpenGL provide animation framework and - tools. - -- GPU can't directly read user input from, say, keyboard, mouse, gamepad, or - play audio, or load files from a hard drive, or anything like that. In this - situation, cannot let GPU handle the animation work [#cpu-gpu-role]_. - -This chapter is giving a concept for the flow above and focuses on shader compiler -for GPU. Furthermore, explaining how GPU has taking more applications from +This chapter gives an overview of how 3D animation is created and run on +CPU+GPU, and gives a concept of the GPU compiler and HW features for graphic applications. +Furthermore, it explains how GPU has taken over more applications from CPU through GPGPU concept and related standards emerged. @@ -75,6 +42,17 @@ Further, after texturing (texture mapping), the model looks real more [#texturemapping]_. To get to know how animation for a 3D modeling, please look video here [#animation1]_. +According to the video on skeleton animation, the joint positions are set at different +poses and a time is given to each pose (keyframe), as shown in :numref:`animation`. + +.. _animation: +.. figure:: ../Fig/gpu/animation.png + :align: center + :scale: 50 % + + Set time point at keyframes + + In this series of video, you find the 3D modeling tools creating Java instead of C/C++ code calling OpenGL api and shaders. It's because Java can call OpenGL api through a wrapper library [#joglwiki]_. @@ -102,6 +80,77 @@ with their type. VRML/X3D Neutral ============== ================== +The four key features a 3D file can store include the model’s geometry, the +model’s surface texture, scene details, and animation of the model [#3dfmt]_. + +Specifically, they can store details about four key features of a 3D model, +though it’s worth bearing in mind that you may not always take advantage of +all four features in all projects, and not all file formats support all four +features! 
+ +3D printer applications do not need to support animation. CAD and CAM such as +designing airplanes do not need the feature of scene details. + +DAE (Collada) appeared in the video animation above. +Collada files belong to a neutral format used heavily in the video game and +film industries. It’s managed by the non-profit technology consortium, the +Khronos Group. + +The file extension for the Collada format is .dae. +The Collada format stores data using the XML mark-up language. + +The original intention behind the Collada format was to become a standard among +3D file formats. Indeed, in 2013, it was adopted by ISO as a publicly available +specification, ISO/PAS 17506. As a result, many 3D modeling programs support +the Collada format. + +That said, the consensus is that the Collada format hasn’t kept up with the +times. It was once used heavily as an interchange format for Autodesk Max/Maya +in film production, but the industry has now shifted more towards OBJ, FBX, +and Alembic [#3dfmt]_. + + +Graphic SW stack +---------------- + +The driver runs on the CPU side as in the following figure. The OpenGL API will call +the driver's functions eventually, and the driver finishes the function's work by issuing +GPU-HW commands and/or sending data to the GPU. The GPU's firmware only manages clock, +voltage, power consumption, ..., etc [#gpu-firmware-jobs]_. +Even so, the GPU's rendering work on the data of 3D vertices, colors, ... sent from +the CPU and stored in the GPU's memory or shared memory consumes more computing power +than the CPU. + +.. _graphic_sw_stack: +.. graphviz:: ../Fig/gpu/graphic-sw-stack.gv + +- According to the previous section, after the user creates the skeleton and skin for each + model and sets the keyframe times through a 3D modeling tool, the 3D modeling tool + can either generate Java code which calls JOGL (Java OpenGL) [#joglwiki]_, + or generate OpenGL API calls directly. The frame data can be calculated by + interpolation between keyframes. 
+ +- As above, for every animation the client CPU program sets the new position of objects + (vertices) and colors, the data of one frame, and the server (driver and GPU) does + the 3D to 2D rendering. Higher-level + libraries and frameworks on top of OpenGL provide animation framework and + tools to generate OpenGL API and shaders from 3D model. + +- Shaders may call builtin-functions, which are written in Compute Shader, SPIR-V or + LLVM-IR. LLVM libclc is a project for builtin-functions in OpenCL which can + be used in OpenGL too [#libclc]_. + Like CPU's builtin-functions, a new GPU ISA/architecture has to implement its + builtin-functions or port them from open source such as libclc. + +- GPU can't directly read user input from, say, keyboard, mouse, gamepad, or + play audio, or load files from a hard drive, or anything like that. In this + situation, the GPU cannot be left to handle the animation work [#cpu-gpu-role]_. + +The flow for 3D/2D graphic processing is shown in the following diagram. + +.. _opengl_flow: +.. graphviz:: ../Fig/gpu/opengl-flow.gv + Basic geometry in computer graphics ----------------------------------- @@ -1058,10 +1107,6 @@ Runtime from Open Source have chance to leverage the effort of scheduling SW fro programmers** [#paper-graph-on-opencl]_. Cuda graph is an idea like this [#cuda-graph-blog]_ [#cuda-graph-pytorch]_ . -.. [#gpu-firmware-jobs] https://antonelly.com.co/do-gpus-have-firmware/#:~:text=Providing%20access%20to%20new%20features,drivers%20during%20the%20boot%20process - -.. [#cpu-gpu-role] https://stackoverflow.com/questions/47426655/cpu-and-gpu-in-3d-game-whos-doing-what - .. [#polygon] https://www.quora.com/Which-one-is-better-for-3D-modeling-Quads-or-Tris .. [#shading] https://en.wikipedia.org/wiki/Shading @@ -1074,6 +1119,12 @@ programmers** [#paper-graph-on-opencl]_. Cuda graph is an idea like this .. [#3dfmt] https://all3dp.com/3d-file-format-3d-files-3d-printer-3d-cad-vrml-stl-obj/ +.. 
[#gpu-firmware-jobs] https://antonelly.com.co/do-gpus-have-firmware/#:~:text=Providing%20access%20to%20new%20features,drivers%20during%20the%20boot%20process + +.. [#libclc] https://libclc.llvm.org + +.. [#cpu-gpu-role] https://stackoverflow.com/questions/47426655/cpu-and-gpu-in-3d-game-whos-doing-what + .. [#wiki-quaternion] https://en.wikipedia.org/wiki/Quaternion .. [#cross-product-wiki] https://en.wikipedia.org/wiki/Cross_product