Support tensor cache when creating tensors (#574)

Support tensor cache when creating tensors. Tensors can be shared between different operations: if two tensors have identical data and quantization parameters, they should share the same low-level tensor object to save memory. This change introduces a tensor cache in tim-vx whose key is the md5sum of the tensor data and whose value is the low-level tensor object. When an incoming constant tensor has the same md5sum and matching quantization parameters, the cached tensor object is reused instead of creating a new one.

Type: New feature

Signed-off-by: Chen Xin <jack.chen@verisilicon.com>
Co-authored-by: Chen Xin <jack.chen@verisilicon.com>
parent 6e38e64a1a
commit f1fd2246ae
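The mechanism can be sketched independently of tim-vx internals: constant tensors are keyed by a digest of their raw bytes, and a creation request consults a map before building a new low-level object. The snippet below is a minimal, hypothetical illustration only (the ConstTensorCache and TensorRecord names, the std::hash-based key, and the std::unordered_map are stand-ins, not tim-vx types); the actual change keys the cache with an OpenSSL MD5 digest inside GraphImpl and also compares quantization scales and zero points before reusing an entry.

```cpp
#include <cstddef>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for illustration only; not tim-vx types.
struct TensorRecord {
  std::vector<float> scales;       // quantization scales
  std::vector<int> zero_points;    // quantization zero points
};

class ConstTensorCache {
 public:
  // Returns a cached record when the data hash and quantization parameters
  // match; otherwise creates, caches, and returns a new record.
  std::shared_ptr<TensorRecord> GetOrCreate(const void* data, size_t bytes,
                                            const std::vector<float>& scales,
                                            const std::vector<int>& zero_points) {
    const std::string key = MakeKey(data, bytes);
    auto it = cache_.find(key);
    if (it != cache_.end() && it->second->scales == scales &&
        it->second->zero_points == zero_points) {
      return it->second;  // share the existing low-level object
    }
    auto record = std::make_shared<TensorRecord>();
    record->scales = scales;
    record->zero_points = zero_points;
    cache_[key] = record;
    return record;
  }

 private:
  // Stand-in for the MD5 digest used by the real implementation.
  static std::string MakeKey(const void* data, size_t bytes) {
    return std::to_string(std::hash<std::string>{}(
        std::string(static_cast<const char*>(data), bytes)));
  }

  std::unordered_map<std::string, std::shared_ptr<TensorRecord>> cache_;
};
```

Two requests with identical bytes and identical quantization parameters then resolve to the same shared object, which is the memory saving this commit targets for constant weights and biases.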
@@ -15,6 +15,7 @@ option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support"
 option(TIM_VX_ENABLE_PLATFORM_LITE "Enable lite multi-device support" OFF)
 option(TIM_VX_ENABLE_GRPC "Enable gPRC support" OFF)
 option(TIM_VX_DBG_ENABLE_TENSOR_HNDL "Enable built-in tensor from handle: use malloced memory instead of VideoMemory by kernel driver" ON)
+option(TIM_VX_ENABLE_TENSOR_CACHE "Enable tensor cache for const tensor" ON)

 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -46,6 +47,11 @@ if(${TIM_VX_ENABLE_40BIT})
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVSI_40BIT_VA_SUPPORT")
 endif()

+if(${TIM_VX_ENABLE_TENSOR_CACHE})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_TENSOR_CACHE")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_TENSOR_CACHE")
+endif()
+
 if(${TIM_VX_ENABLE_CUSTOM_OP})
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTIM_VX_ENABLE_CUSTOM_OP")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTIM_VX_ENABLE_CUSTOM_OP")
@@ -93,6 +99,9 @@ if(TIM_VX_ENABLE_GRPC)
     include(cmake/gRPC.cmake)
 endif()

+if(TIM_VX_ENABLE_TENSOR_CACHE)
+    find_package(OpenSSL REQUIRED)
+endif()
 add_subdirectory("src/tim")

 if(TIM_VX_BUILD_EXAMPLES)
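The cache key is an MD5 digest computed through OpenSSL's EVP interface, which is why OpenSSL becomes a required dependency whenever TIM_VX_ENABLE_TENSOR_CACHE is ON (the default) and libcrypto is linked into the library target further below. As a reference for how such a digest is obtained, here is a minimal standalone sketch using the same EVP calls (Md5Hex is an illustrative helper name, not part of the commit); link with -lcrypto:

```cpp
#include <openssl/evp.h>

#include <cstdio>
#include <string>

// Returns the uppercase hex MD5 digest of `src`, or an empty string on failure.
std::string Md5Hex(const std::string& src) {
  unsigned char md_value[EVP_MAX_MD_SIZE];
  unsigned int md_len = 0;

  EVP_MD_CTX* ctx = EVP_MD_CTX_new();
  if (ctx == nullptr) return {};

  std::string hex;
  if (EVP_DigestInit_ex(ctx, EVP_md5(), nullptr) &&
      EVP_DigestUpdate(ctx, src.data(), src.size()) &&
      EVP_DigestFinal_ex(ctx, md_value, &md_len)) {
    char buf[3];
    for (unsigned int i = 0; i < md_len; ++i) {
      std::snprintf(buf, sizeof(buf), "%02X", md_value[i]);
      hex += buf;  // two hex characters per digest byte
    }
  }
  EVP_MD_CTX_free(ctx);
  return hex;
}
```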
@@ -84,6 +84,7 @@ cmake options:
 |`VIP_LITE_SDK` | full path to VIPLite sdk, required when `TIM_VX_ENABLE_PLATFORM_LITE`=ON | Not set |
 |`TIM_VX_ENABLE_GRPC` | Enable gPRC support, only work when `TIM_VX_ENABLE_PLATFORM`=ON | OFF |
 |`TIM_VX_DBG_ENABLE_TENSOR_HNDL` | Enable built-in tensor from handle | ON |
+|`TIM_VX_ENABLE_TENSOR_CACHE` | Enable tensor cache for const tensor | ON |

 ----
 Run unit test:
@@ -141,6 +141,10 @@ target_include_directories(${TARGET_NAME} PRIVATE ${INC_DIRS})
 target_link_libraries(${TARGET_NAME} PUBLIC
     -Wl,--no-whole-archive ${OVXDRV_LIBRARIES} ${EXTERNAL_LIBS})

+if(${TIM_VX_ENABLE_TENSOR_CACHE})
+    target_link_libraries(${TARGET_NAME} PUBLIC ${OPENSSL_CRYPTO_LIBRARY})
+endif()
+
 if(${TIM_VX_USE_EXTERNAL_OVXLIB})
     #-Wl,--whole-archive should not applied to external library, but only for shared library
     target_link_libraries(${TARGET_NAME} PUBLIC tim_internal)
@@ -171,3 +171,53 @@ TEST(GroupedConv2d, kernel_bigger_than_input_SAME) {
   EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
   EXPECT_EQ(golden, output);
 }
+
+TEST(FC, share_const_tensor) {
+  auto ctx = tim::vx::Context::Create();
+  auto src_graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType input_shape({2, 1});
+  tim::vx::ShapeType kernel_shape({2, 2});
+  tim::vx::ShapeType bias_shape({2});
+  tim::vx::ShapeType output_shape({2, 1});
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, input_shape,
+                                 tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
+                                  tim::vx::TensorAttribute::CONSTANT);
+  tim::vx::TensorSpec bias_spec(tim::vx::DataType::FLOAT32, bias_shape,
+                                tim::vx::TensorAttribute::CONSTANT);
+  tim::vx::TensorSpec tran_spec(tim::vx::DataType::FLOAT32, output_shape,
+                                tim::vx::TensorAttribute::TRANSIENT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
+                                  tim::vx::TensorAttribute::OUTPUT);
+  std::vector<float> in_data = {1, 4};
+  std::vector<float> weight = {-3, 3, 2, 1};
+  std::vector<float> bias = {0.1, 0.4};
+  std::vector<float> golden = {-8, 25};
+  auto input_tensor = src_graph->CreateTensor(input_spec);
+  auto weight_tensor = src_graph->CreateTensor(kernel_spec, weight.data());
+  auto bias_tensor = src_graph->CreateTensor(bias_spec, bias.data());
+  auto tran_tensor = src_graph->CreateTensor(tran_spec);
+  auto output_tensor = src_graph->CreateTensor(output_spec);
+
+  auto op1 = src_graph->CreateOperation<tim::vx::ops::FullyConnected>(0, 2);
+  (*op1).BindInputs({input_tensor, weight_tensor, bias_tensor}).BindOutputs({tran_tensor});
+
+  auto op2 = src_graph->CreateOperation<tim::vx::ops::FullyConnected>(0, 2);
+  (*op2).BindInputs({tran_tensor, weight_tensor, bias_tensor}).BindOutputs({output_tensor});
+  // Do layout inference
+  auto transform = tim::transform::LayoutInference(src_graph, ctx);
+  auto infer_graph = transform.first;
+  auto graph_io_map = transform.second;
+  infer_graph->Compile();
+
+  auto infer_input = graph_io_map[src_graph->InputsTensor()[0]];
+  auto infer_output = graph_io_map[src_graph->OutputsTensor()[0]];
+
+  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
+  infer_graph->Run();
+
+  std::vector<float> output(golden.size());
+  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
+  EXPECT_EQ(golden, output);
+}
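For reference, the golden values in the new test can be reproduced with a plain CPU model of the two fully connected layers. The sketch below is illustrative only and assumes a unit-major weight layout, out[j] = sum_i(in[i] * w[j*N + i]) + b[j], chosen because it reproduces the test's expectations: the first layer maps {1, 4} to roughly {9.1, 6.4}, and feeding that through the same weights and bias again yields {-8, 25}.

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference FC under the assumed layout: out[j] = sum_i in[i] * w[j*N + i] + b[j].
std::vector<float> FullyConnectedRef(const std::vector<float>& in,
                                     const std::vector<float>& w,
                                     const std::vector<float>& b) {
  std::vector<float> out(b.size(), 0.0f);
  for (size_t j = 0; j < b.size(); ++j) {
    for (size_t i = 0; i < in.size(); ++i) {
      out[j] += in[i] * w[j * in.size() + i];
    }
    out[j] += b[j];
  }
  return out;
}

int main() {
  const std::vector<float> in = {1.0f, 4.0f};
  const std::vector<float> w = {-3.0f, 3.0f, 2.0f, 1.0f};
  const std::vector<float> b = {0.1f, 0.4f};

  auto hidden = FullyConnectedRef(in, w, b);   // approximately {9.1, 6.4}
  auto out = FullyConnectedRef(hidden, w, b);  // approximately {-8, 25}

  // Compare against the test's golden values with a small tolerance.
  assert(std::fabs(out[0] - (-8.0f)) < 1e-4f);
  assert(std::fabs(out[1] - 25.0f) < 1e-4f);
  return 0;
}
```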
@@ -24,6 +24,10 @@
 #include "tim/vx/graph.h"
 #include <algorithm>

+#ifdef ENABLE_TENSOR_CACHE
+#include <openssl/evp.h>
+#endif
+
 #include "context_private.h"
 #include "graph_private.h"
 #include "op_impl.h"
@@ -55,6 +59,96 @@ GraphImpl::GraphImpl(ContextImpl* context, const CompileOption& options)

 GraphImpl::~GraphImpl() { vsi_nn_ReleaseGraph(&graph_); }

+#ifdef ENABLE_TENSOR_CACHE
+std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GraphImpl::GetTensorCacheMap() {
+  return cached_tensor_;
+}
+
+#define MD5_SECRET_LEN_16 (16)
+#define MD5_BYTE_STRING_LEN (4)
+const std::string GraphImpl::caclulateMd5Secret32(const std::string& src) {
+  std::string md5String;
+  EVP_MD_CTX *mdctx;
+  const EVP_MD *md;
+  uint32_t md_len;
+  unsigned char md_value[MD5_SECRET_LEN_16] = {0};
+  char tmp[MD5_BYTE_STRING_LEN] = {0};
+
+  OpenSSL_add_all_digests();
+  md = EVP_md5();
+  if (md == NULL) {
+    VSILOGE("Unknown EVP_md5 message.");
+  }
+  mdctx = EVP_MD_CTX_new();
+  if (!EVP_DigestInit_ex(mdctx, md, NULL)) {
+    VSILOGE("EVP_MD_CTX initialization failed.");
+    EVP_MD_CTX_free(mdctx);
+  }
+  if (!EVP_DigestUpdate(mdctx, src.c_str(), src.size())) {
+    VSILOGE("EVP_MD_CTX update failed.");
+    EVP_MD_CTX_free(mdctx);
+  }
+  if (!EVP_DigestFinal_ex(mdctx, md_value, &md_len)) {
+    VSILOGE("EVP_MD_CTX finalization failed.");
+    EVP_MD_CTX_free(mdctx);
+  }
+  EVP_MD_CTX_free(mdctx);
+
+  for (int i = 0; i < MD5_SECRET_LEN_16; ++i) {
+    memset(tmp, 0x00, sizeof(tmp));
+    snprintf(tmp, sizeof(tmp), "%02X", md_value[i]);
+    md5String += tmp;
+  }
+  return md5String;
+}
+
+const std::string GraphImpl::CaclulateCacheKey(const TensorSpec& spec, const void* data) {
+  std::string md5_key;
+  uint32_t data_size = 1;
+  for (auto it = spec.shape_.begin(); it != spec.shape_.end(); ++it) {
+    data_size *= *it;
+  }
+  switch (spec.datatype_) {
+    case DataType::INT16:
+    case DataType::UINT16:
+    case DataType::FLOAT16:
+      data_size *= 2;
+      break;
+    case DataType::INT32:
+    case DataType::UINT32:
+    case DataType::FLOAT32:
+      data_size *= 4;
+      break;
+    case DataType::INT64:
+      data_size *= 8;
+      break;
+    default:
+      break;
+  }
+  if (data_size < 512) {
+    md5_key = caclulateMd5Secret32(std::string((const char*)data, data_size));
+  } else {
+    md5_key = caclulateMd5Secret32(
+        std::string((const char*)data, 512));  // Take first 512 bytes
+  }
+  return md5_key;
+}
+
+std::shared_ptr<Tensor> GraphImpl::GetTensorFromCache(const TensorSpec& spec, const void* data) {
+  std::shared_ptr<tim::vx::Tensor> tensor;
+  std::string md5_key = CaclulateCacheKey(spec, data);
+  if (GetTensorCacheMap().find(md5_key) != GetTensorCacheMap().end() &&
+      GetTensorCacheMap()[md5_key]->GetQuantization().Scales() == spec.quantization_.Scales() &&
+      GetTensorCacheMap()[md5_key]->GetQuantization().ZeroPoints() == spec.quantization_.ZeroPoints()) {
+    tensor = GetTensorCacheMap()[md5_key];
+  } else {
+    tensor = std::make_shared<TensorImpl>(this, spec, data);
+    GetTensorCacheMap()[md5_key] = tensor;
+  }
+  return tensor;
+}
+#endif
+
 vsi_nn_graph_t* GraphImpl::graph() { return graph_; }

 void GraphImpl::AddInput(vsi_nn_tensor_id_t id) {
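With ENABLE_TENSOR_CACHE defined, GetTensorFromCache returns the cached shared_ptr itself when both the digest and the quantization parameters match, so creating the same constant tensor twice is expected to hand back the same underlying object. A hypothetical usage sketch follows (the header paths follow the public tim-vx layout, and the equality check is an expectation derived from this diff, not a documented API guarantee):

```cpp
#include <vector>

#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/tensor.h"
#include "tim/vx/types.h"

// Illustrative only: with TIM_VX_ENABLE_TENSOR_CACHE=ON, two constant tensors
// created from identical bytes and identical quantization parameters are
// expected to share one low-level tensor object.
int main() {
  auto ctx = tim::vx::Context::Create();
  auto graph = ctx->CreateGraph();

  tim::vx::ShapeType kernel_shape({2, 2});
  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
                                  tim::vx::TensorAttribute::CONSTANT);
  std::vector<float> weight = {-3.0f, 3.0f, 2.0f, 1.0f};

  // Both requests go through GraphImpl::GetTensorFromCache; the second one is
  // expected to be served from the cache rather than allocating again.
  auto w1 = graph->CreateTensor(kernel_spec, weight.data());
  auto w2 = graph->CreateTensor(kernel_spec, weight.data());

  // Expectation based on the commit: both shared_ptrs refer to the same object.
  return w1 == w2 ? 0 : 1;
}
```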
@@ -135,6 +229,11 @@ void GraphImpl::PrintGraph() const { vsi_nn_PrintGraph(this->graph_); }

 std::shared_ptr<Tensor> GraphImpl::CreateTensor(const TensorSpec& spec,
                                                 const void* data) {
+#ifdef ENABLE_TENSOR_CACHE
+  if (spec.attr_ & TensorAttribute::CONSTANT && data != NULL) {
+    return GetTensorFromCache(spec, data);
+  }
+#endif
   auto tensor = std::make_shared<TensorImpl>(this, spec, data);
   if (spec.attr_ & TensorAttribute::INPUT) {
     this->AddInput(tensor);
@@ -26,6 +26,7 @@
 #include "tim/vx/graph.h"

 #include <vector>
+#include <string>
 #include <mutex>
 #include <utility>
 #include <map>
@@ -43,7 +44,12 @@ class GraphImpl : public Graph {
  public:
   GraphImpl(ContextImpl* context, const CompileOption& options = CompileOption::DefaultOptions);
   ~GraphImpl();
+#ifdef ENABLE_TENSOR_CACHE
+  std::shared_ptr<Tensor> GetTensorFromCache(const TensorSpec& spec, const void* data);
+  const std::string CaclulateCacheKey(const TensorSpec& spec, const void* data);
+  const std::string caclulateMd5Secret32(const std::string& src);
+  std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GetTensorCacheMap();
+#endif
   /// Return the low-level graph object
   vsi_nn_graph_t* graph();
   void AddInput(vsi_nn_tensor_id_t id);
@@ -97,7 +103,9 @@ class GraphImpl : public Graph {
   int32_t not_consumed_output_cnt_;
   std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>> tensor_consumers_;
   std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>> tensor_producer_;
+#ifdef ENABLE_TENSOR_CACHE
+  std::map<std::string, std::shared_ptr<tim::vx::Tensor>> cached_tensor_;
+#endif
   CompileOption options_;
  private:
   /// Setup graph