feat(python_ffi): 支持向 Stream 传入数据

YdrMaster · YdrMaster · commit a1109e74b14c · 2023-11-02T18:13:56.000+08:00
Signed-off-by: YdrMaster <ydrml@hotmail.com>
diff --git a/src/00common/include/common/fp16_t.h b/src/00common/include/common/fp16_t.h
@@ -99,7 +99,6 @@ namespace refactor {
         }
     };
 
-
     inline const fp16_t fp16_t::ZERO = fp16_t(0.0f);
     inline const fp16_t fp16_t::ONE = fp16_t(1.0f);
     inline const fp16_t fp16_t::INF = fp16_t((uint16_t) 0b0'11111'0000000000);
diff --git a/src/03runtime/include/runtime/stream.h b/src/03runtime/include/runtime/stream.h
@@ -21,7 +21,8 @@ namespace refactor::runtime {
         bool isBlob() const noexcept;
         bool isOffset() const noexcept;
 
-        size_t getOffset() const;
+        auto blob() const noexcept -> mem_manager::SharedForeignBlob const &;
+        auto offset() const noexcept -> size_t;
     };
 
     class Stream {
@@ -31,14 +32,19 @@ namespace refactor::runtime {
 
         Resources _resources;
         mem_manager::SharedForeignBlob _stack;
+        std::vector<size_t> _outputsSize;
         _G _internal;
 
     public:
         Stream(Resources,
-               mem_manager::SharedForeignBlob,
+               size_t stack,
+               std::vector<size_t> outputs,
                graph_topo::GraphTopo,
                std::vector<_N>,
                std::vector<_E>);
+        void setInput(uint_lv1, void const *, size_t);
+        void setInput(uint_lv1, mem_manager::SharedForeignBlob);
+        std::vector<uint_lv1> prepare();
         void run();
     };
 
diff --git a/src/03runtime/src/stream.cc b/src/03runtime/src/stream.cc
@@ -1,6 +1,8 @@
 ﻿#include "runtime/stream.h"
+#include "runtime/mem_manager.hh"
 
 namespace refactor::runtime {
+    using mem_manager::ForeignBlob;
 
     void emptyRoutine(runtime::Resources &, void const **, void **) {}
 
@@ -15,23 +17,62 @@ namespace refactor::runtime {
     bool Address::isOffset() const noexcept {
         return std::holds_alternative<size_t>(value);
     }
-
-    size_t Address::getOffset() const {
+    /// Returns the foreign blob stored in this address.
+    ///
+    /// Only call this when the address holds a blob (see `isBlob()`):
+    /// `std::get` throws `std::bad_variant_access` on the wrong alternative,
+    /// and because this function is `noexcept` that would terminate the program.
+    mem_manager::SharedForeignBlob const &Address::blob() const noexcept {
+        using Blob = mem_manager::SharedForeignBlob;
+        return std::get<Blob>(value);
+    }
+    /// Returns the offset stored in this address.
+    ///
+    /// Only call this when `isOffset()` is true: `std::get` throws on the
+    /// wrong alternative, which terminates the program in a `noexcept` function.
+    size_t Address::offset() const noexcept {
         return std::get<size_t>(value);
     }
 
     Stream::Stream(Resources resources,
-                   mem_manager::SharedForeignBlob stack,
+                   size_t stack,
+                   std::vector<size_t> outputs,
                    graph_topo::GraphTopo topology,
                    std::vector<_N> routines,
                    std::vector<_E> offsets)
         : _resources(std::move(resources)),
-          _stack(std::move(stack)),
+          // `_stack` is created from the memory manager stored in `_resources`
+          // (the member, not the moved-from parameter). This relies on
+          // `_resources` being declared, and therefore initialized, before `_stack`.
+          _stack(ForeignBlob::share(_resources.fetch<MemManager>()->manager, stack)),
+          _outputsSize(std::move(outputs)),
           _internal(_G{
               std::move(topology),
               std::move(routines),
               std::move(offsets),
-          }) {}
+          }) {
+    }
+
+    /// Binds raw host data to the `i`-th global input of the graph.
+    ///
+    /// A new blob of `size` bytes is created through the stream's memory
+    /// manager, filled via `copyIn(data, size)`, and stored as the address of
+    /// the corresponding input edge. Asserts that `i` is a valid input index.
+    void Stream::setInput(uint_lv1 i, void const *data, size_t size) {
+        auto inputs = _internal.topology.globalInputs();
+        ASSERT(i < inputs.size(), "input index out of range");
+
+        auto filled = ForeignBlob::share(_resources.fetch<MemManager>()->manager, size);
+        filled->copyIn(data, size);
+        _internal.edges[inputs[i]].value = {std::move(filled)};
+    }
+    /// Binds an already existing blob to the `i`-th global input of the graph.
+    ///
+    /// Asserts that `i` is a valid input index, then stores `blob` as the
+    /// address of the corresponding input edge.
+    void Stream::setInput(uint_lv1 i, mem_manager::SharedForeignBlob blob) {
+        auto inputs = _internal.topology.globalInputs();
+        ASSERT(i < inputs.size(), "input index out of range");
+
+        auto &slot = _internal.edges[inputs[i]];
+        slot.value = {std::move(blob)};
+    }
+
+    /// Checks that every global input has data bound and, if so, creates the
+    /// output blobs.
+    ///
+    /// @return Indices (into the graph's global inputs) of the inputs that have
+    ///         no blob bound yet. When the result is empty, every global output
+    ///         edge has been given a new blob of its recorded size, created
+    ///         with `ForeignBlob::share`.
+    std::vector<uint_lv1> Stream::prepare() {
+        auto globalInputs = _internal.topology.globalInputs();
+        std::vector<uint_lv1> unknownInputs;
+        for (auto i : range0_(globalInputs.size())) {
+            auto const &edge = _internal.edges[globalInputs[i]];
+            // `Address::blob()` is `noexcept` but built on `std::get`, so calling
+            // it on an edge that holds an offset would terminate the program.
+            // Check the active alternative first and report such an input as unset.
+            if (!edge.isBlob() || !edge.blob()) {
+                unknownInputs.push_back(i);
+            }
+        }
+        if (unknownInputs.empty()) {
+            auto allocator = _resources.fetch<MemManager>()->manager;
+            auto outputs = _internal.topology.globalOutputs();
+            for (auto i : range0_(outputs.size())) {
+                _internal.edges[outputs[i]].value = {ForeignBlob::share(allocator, _outputsSize[i])};
+            }
+        }
+        return unknownInputs;
+    }
 
     void Stream::run() {
         auto map = [this](auto i) { return _internal.edges[i](*_stack); };
diff --git a/src/04kernel/src/allocators/reusable_allocator.cpp b/src/04kernel/src/allocators/reusable_allocator.cpp
@@ -27,7 +27,7 @@ namespace refactor::kernel {
                 if (!--edgeRc[inputIdx]) {
                     // indicate that this tensor will no longer be used and perform memory free
                     if (addresses[inputIdx].isOffset()) {
-                        calculator.free(addresses[inputIdx].getOffset(), g.edges[inputIdx].size);
+                        calculator.free(addresses[inputIdx].offset(), g.edges[inputIdx].size);
                     }
                 }
             }
diff --git a/src/04kernel/src/graph.cc b/src/04kernel/src/graph.cc
@@ -24,13 +24,18 @@ namespace refactor::kernel {
                                       ? node.kernel->lower()
                                       : refactor::runtime::emptyRoutine;
                        });
-        auto [size, offsets] = allocator(_internal, sizeof(uint64_t));
-        auto memManager = _target.memManager();
+        auto [stack, offsets] = allocator(_internal, sizeof(uint64_t));
+        auto outputs = _internal.topology.globalOutputs();
+        std::vector<size_t> outputs_(outputs.size());
+        std::transform(outputs.begin(), outputs.end(),
+                       outputs_.begin(),
+                       [this](auto const &edge) { return _internal.edges[edge].size; });
         runtime::Resources res;
-        res.fetchOrStore<runtime::MemManager>(memManager);
+        res.fetchOrStore<runtime::MemManager>(_target.memManager());
         return runtime::Stream(
             std::move(res),
-            mem_manager::ForeignBlob::share(std::move(memManager), size),
+            stack,
+            std::move(outputs_),
             _internal.topology,
             std::move(routines),
             std::move(offsets));
diff --git a/src/09python_ffi/src/compiler.cc b/src/09python_ffi/src/compiler.cc
@@ -68,12 +68,11 @@ namespace refactor::python_ffi {
             UNREACHABLE();
         }
 
-        auto kernel = computation.lower(target_);
-        if (allocator == "flat") {
-            return std::make_shared<Executor>(kernel.lower(kernel::flatAllocate));
-        } else {
-            return std::make_shared<Executor>(kernel.lower(kernel::reusableAllocate));
-        }
+        return std::make_shared<Executor>(
+            computation.lower(target_),
+            allocator == "flat"
+                ? kernel::flatAllocate
+                : kernel::reusableAllocate);
     }
 
     std::optional<py::array>
diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc
@@ -2,8 +2,17 @@
 
 namespace refactor::python_ffi {
 
-    Executor::Executor(runtime::Stream stream)
-        : _stream(std::move(stream)) {
+    /// Stores the kernel graph and allocator, then lowers the graph into a
+    /// runtime stream.
+    ///
+    /// `_stream` is built from the `_graph` and `_allocator` members (not the
+    /// constructor parameters), so it relies on both being declared before
+    /// `_stream` in the class.
+    Executor::Executor(kernel::Graph graph, kernel::Allocator allocator)
+        : _graph(std::move(graph)),
+          _allocator(allocator),
+          _stream(_graph.lower(_allocator)) {}
+
+    /// Forwards the tensor's data pointer and byte size to the `i`-th input
+    /// of the underlying stream.
+    ///
+    /// NOTE(review): `tensor->data` is dereferenced unconditionally, so this
+    /// presumably requires a tensor that actually carries data; confirm
+    /// against callers.
+    void Executor::setInput(uint_lv1 i, SharedTensor tensor) {
+        void const *src = tensor->data->operator const void *();
+        auto bytes = tensor->bytesSize();
+        _stream.setInput(i, src, bytes);
+    }
+
+    /// Delegates to `Stream::prepare` and returns its result unchanged.
+    std::vector<uint_lv1> Executor::prepare() {
+        auto missing = _stream.prepare();
+        return missing;
     }
 
 }// namespace refactor::python_ffi
diff --git a/src/09python_ffi/src/executor.h b/src/09python_ffi/src/executor.h
@@ -1,15 +1,21 @@
 ﻿#ifndef PYTHON_FFI_EXECUTOR_H
 #define PYTHON_FFI_EXECUTOR_H
 
-#include "runtime/stream.h"
+#include "frontend/tensor.h"
+#include "kernel/graph.h"
 
 namespace refactor::python_ffi {
+    using SharedTensor = Arc<frontend::Tensor>;
 
     class Executor {
+        // Declaration order matters: the constructor initializes `_stream`
+        // from `_graph` and `_allocator`, so both must be declared before it.
+        kernel::Graph _graph;
+        kernel::Allocator _allocator;
         runtime::Stream _stream;
 
     public:
-        explicit Executor(runtime::Stream);
+        Executor(kernel::Graph, kernel::Allocator);
+        void setInput(uint_lv1, SharedTensor);
+        std::vector<uint_lv1> prepare();
     };
 
 }// namespace refactor::python_ffi
diff --git a/src/09python_ffi/src/main.cpp b/src/09python_ffi/src/main.cpp
@@ -33,7 +33,9 @@ namespace refactor::python_ffi {
             .def("get_tensor"      , &Compiler::getTensor        , return_::move      )
             .def("compile"         , &Compiler::compile          , return_::move      );
 
-        py::class_<Executor , Arc<Executor>>(m, "Executor" );
+        py::class_<Executor , Arc<Executor>>(m, "Executor" )
+            .def("setInput"        , &Executor::setInput         , return_::automatic )
+            .def("prepare"         , &Executor::prepare          , return_::move      );
 
         // clang-format on
     }

Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,6 @@ namespace refactor {`
`99`	`99`	`}`
`100`	`100`	`};`
`101`	`101`
`102`		`-`
`103`	`102`	`inline const fp16_t fp16_t::ZERO = fp16_t(0.0f);`
`104`	`103`	`inline const fp16_t fp16_t::ONE = fp16_t(1.0f);`
`105`	`104`	`inline const fp16_t fp16_t::INF = fp16_t((uint16_t) 0b0'11111'0000000000);`
Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ namespace refactor::kernel {`
`27`	`27`	`if (!--edgeRc[inputIdx]) {`
`28`	`28`	`// indicate that this tensor will no longer be used and perform memory free`
`29`	`29`	`if (addresses[inputIdx].isOffset()) {`
`30`		`- calculator.free(addresses[inputIdx].getOffset(), g.edges[inputIdx].size);`
	`30`	`+ calculator.free(addresses[inputIdx].offset(), g.edges[inputIdx].size);`
`31`	`31`	`}`
`32`	`32`	`}`
`33`	`33`	`}`
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,9 @@ namespace refactor::python_ffi {`
`33`	`33`	`.def("get_tensor" , &Compiler::getTensor , return_::move )`
`34`	`34`	`.def("compile" , &Compiler::compile , return_::move );`
`35`	`35`
`36`		`- py::class_<Executor , Arc<Executor>>(m, "Executor" );`
	`36`	`+ py::class_<Executor , Arc<Executor>>(m, "Executor" )`
	`37`	`+ .def("setInput" , &Executor::setInput , return_::automatic )`
	`38`	`+ .def("prepare" , &Executor::prepare , return_::move );`
`37`	`39`
`38`	`40`	`// clang-format on`
`39`	`41`	`}`