Support tensor cache when creating tensors (#574)

Support tensor cache when creating tensors

Tensors can be shared between different operations: if tensors have
identical data and quantization parameters, they should share the same
low-level tensor object to save memory.

This change introduces a tensor cache in tim-vx whose key is the md5sum
of the tensor data and whose value is the low-level tensor object. If an
incoming tensor has the same md5sum, the cached tensor object is reused
for tensor creation.
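
For illustration only (not part of this patch): a minimal sketch of how the
cache behaves from the public API, assuming TIM_VX_ENABLE_TENSOR_CACHE is ON
(the new default) and a float32 constant tensor; the wrapper function name
below is made up.

    #include "tim/vx/context.h"
    #include "tim/vx/graph.h"
    #include "tim/vx/tensor.h"
    #include <vector>

    void share_const_tensor_sketch() {
      auto ctx = tim::vx::Context::Create();
      auto graph = ctx->CreateGraph();
      tim::vx::ShapeType shape({2, 2});
      tim::vx::TensorSpec spec(tim::vx::DataType::FLOAT32, shape,
                               tim::vx::TensorAttribute::CONSTANT);
      std::vector<float> weight = {-3, 3, 2, 1};
      // First creation hashes the data (md5sum) and stores the tensor in the
      // graph's cache.
      auto w0 = graph->CreateTensor(spec, weight.data());
      // Identical data and quantization -> same md5sum, so the cached
      // low-level tensor object is reused instead of allocating a new one.
      auto w1 = graph->CreateTensor(spec, weight.data());
    }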

Type: New feature

Signed-off-by: Chen Xin <jack.chen@verisilicon.com>
Co-authored-by: Chen Xin <jack.chen@verisilicon.com>
chxin66 2023-04-19 21:31:25 +08:00 committed by GitHub
parent 6e38e64a1a
commit f1fd2246ae
6 changed files with 173 additions and 2 deletions


@@ -15,6 +15,7 @@ option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support"
option(TIM_VX_ENABLE_PLATFORM_LITE "Enable lite multi-device support" OFF)
option(TIM_VX_ENABLE_GRPC "Enable gPRC support" OFF)
option(TIM_VX_DBG_ENABLE_TENSOR_HNDL "Enable built-in tensor from handle: use malloced memory instead of VideoMemory by kernel driver" ON)
option(TIM_VX_ENABLE_TENSOR_CACHE "Enable tensor cache for const tensor" ON)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -46,6 +47,11 @@ if(${TIM_VX_ENABLE_40BIT})
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVSI_40BIT_VA_SUPPORT")
endif()

if(${TIM_VX_ENABLE_TENSOR_CACHE})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_TENSOR_CACHE")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_TENSOR_CACHE")
endif()

if(${TIM_VX_ENABLE_CUSTOM_OP})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTIM_VX_ENABLE_CUSTOM_OP")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTIM_VX_ENABLE_CUSTOM_OP")
@@ -93,6 +99,9 @@ if(TIM_VX_ENABLE_GRPC)
    include(cmake/gRPC.cmake)
endif()

if(TIM_VX_ENABLE_TENSOR_CACHE)
    find_package(OpenSSL REQUIRED)
endif()

add_subdirectory("src/tim")
if(TIM_VX_BUILD_EXAMPLES)


@@ -84,6 +84,7 @@ cmake options:
|`VIP_LITE_SDK` | full path to VIPLite sdk, required when `TIM_VX_ENABLE_PLATFORM_LITE`=ON | Not set |
|`TIM_VX_ENABLE_GRPC` | Enable gPRC support, only work when `TIM_VX_ENABLE_PLATFORM`=ON | OFF |
|`TIM_VX_DBG_ENABLE_TENSOR_HNDL` | Enable built-in tensor from handle | ON |
|`TIM_VX_ENABLE_TENSOR_CACHE` | Enable tensor cache for const tensor | ON |

----

Run unit test:


@@ -141,6 +141,10 @@ target_include_directories(${TARGET_NAME} PRIVATE ${INC_DIRS})
target_link_libraries(${TARGET_NAME} PUBLIC
    -Wl,--no-whole-archive ${OVXDRV_LIBRARIES} ${EXTERNAL_LIBS})

if(${TIM_VX_ENABLE_TENSOR_CACHE})
    target_link_libraries(${TARGET_NAME} PUBLIC ${OPENSSL_CRYPTO_LIBRARY})
endif()

if(${TIM_VX_USE_EXTERNAL_OVXLIB})
    #-Wl,--whole-archive should not applied to external library, but only for shared library
    target_link_libraries(${TARGET_NAME} PUBLIC tim_internal)


@@ -167,6 +167,56 @@ TEST(GroupedConv2d, kernel_bigger_than_input_SAME) {
  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
  infer_graph->Run();
  std::vector<float> output(golden.size());
  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
  EXPECT_EQ(golden, output);
}

TEST(FC, share_const_tensor) {
  auto ctx = tim::vx::Context::Create();
  auto src_graph = ctx->CreateGraph();

  tim::vx::ShapeType input_shape({2, 1});
  tim::vx::ShapeType kernel_shape({2, 2});
  tim::vx::ShapeType bias_shape({2});
  tim::vx::ShapeType output_shape({2, 1});
  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, input_shape,
                                 tim::vx::TensorAttribute::INPUT);
  tim::vx::TensorSpec kernel_spec(tim::vx::DataType::FLOAT32, kernel_shape,
                                  tim::vx::TensorAttribute::CONSTANT);
  tim::vx::TensorSpec bias_spec(tim::vx::DataType::FLOAT32, bias_shape,
                                tim::vx::TensorAttribute::CONSTANT);
  tim::vx::TensorSpec tran_spec(tim::vx::DataType::FLOAT32, output_shape,
                                tim::vx::TensorAttribute::TRANSIENT);
  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
                                  tim::vx::TensorAttribute::OUTPUT);

  std::vector<float> in_data = {1,4,};
  std::vector<float> weight = {-3,3,2,1,};
  std::vector<float> bias = {0.1, 0.4,};
  std::vector<float> golden = {-8, 25};

  auto input_tensor = src_graph->CreateTensor(input_spec);
  auto weight_tensor = src_graph->CreateTensor(kernel_spec, weight.data());
  auto bias_tensor = src_graph->CreateTensor(bias_spec, bias.data());
  auto tran_tensor = src_graph->CreateTensor(tran_spec);
  auto output_tensor = src_graph->CreateTensor(output_spec);

  auto op1 = src_graph->CreateOperation<tim::vx::ops::FullyConnected>(0, 2);
  (*op1).BindInputs({input_tensor, weight_tensor, bias_tensor}).BindOutputs({tran_tensor});
  auto op2 = src_graph->CreateOperation<tim::vx::ops::FullyConnected>(0, 2);
  (*op2).BindInputs({tran_tensor, weight_tensor, bias_tensor}).BindOutputs({output_tensor});

  // Do layout inference
  auto transform = tim::transform::LayoutInference(src_graph, ctx);
  auto infer_graph = transform.first;
  auto graph_io_map = transform.second;
  infer_graph->Compile();

  auto infer_input = graph_io_map[src_graph->InputsTensor()[0]];
  auto infer_output = graph_io_map[src_graph->OutputsTensor()[0]];
  infer_input->CopyDataToTensor(in_data.data(), in_data.size() * sizeof(float));
  infer_graph->Run();

  std::vector<float> output(golden.size());
  EXPECT_TRUE(infer_output->CopyDataFromTensor(output.data()));
  EXPECT_EQ(golden, output);


@@ -24,6 +24,10 @@
#include "tim/vx/graph.h"
#include <algorithm>
#ifdef ENABLE_TENSOR_CACHE
#include <openssl/evp.h>
#endif
#include "context_private.h"
#include "graph_private.h"
#include "op_impl.h"
@@ -55,6 +59,96 @@ GraphImpl::GraphImpl(ContextImpl* context, const CompileOption& options)
GraphImpl::~GraphImpl() { vsi_nn_ReleaseGraph(&graph_); }

#ifdef ENABLE_TENSOR_CACHE
std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GraphImpl::GetTensorCacheMap() {
  return cached_tensor_;
}

#define MD5_SECRET_LEN_16 (16)
#define MD5_BYTE_STRING_LEN (4)

// Compute the 32-character hex MD5 digest of src with the OpenSSL EVP API.
const std::string GraphImpl::caclulateMd5Secret32(const std::string& src) {
  std::string md5String;
  EVP_MD_CTX *mdctx;
  const EVP_MD *md;
  uint32_t md_len;
  unsigned char md_value[MD5_SECRET_LEN_16] = {0};
  char tmp[MD5_BYTE_STRING_LEN] = {0};

  OpenSSL_add_all_digests();
  md = EVP_md5();
  if (md == NULL) {
    VSILOGE("Unknown EVP_md5 message.");
  }
  mdctx = EVP_MD_CTX_new();
  if (!EVP_DigestInit_ex(mdctx, md, NULL)) {
    VSILOGE("EVP_MD_CTX initialization failed.");
    EVP_MD_CTX_free(mdctx);
  }
  if (!EVP_DigestUpdate(mdctx, src.c_str(), src.size())) {
    VSILOGE("EVP_MD_CTX update failed.");
    EVP_MD_CTX_free(mdctx);
  }
  if (!EVP_DigestFinal_ex(mdctx, md_value, &md_len)) {
    VSILOGE("EVP_MD_CTX finalization failed.");
    EVP_MD_CTX_free(mdctx);
  }
  EVP_MD_CTX_free(mdctx);

  // Convert the 16-byte digest to a hex string.
  for (int i = 0; i < MD5_SECRET_LEN_16; ++i) {
    memset(tmp, 0x00, sizeof(tmp));
    snprintf(tmp, sizeof(tmp), "%02X", md_value[i]);
    md5String += tmp;
  }
  return md5String;
}

// Build the cache key: the md5sum of the tensor data (at most the first 512 bytes).
const std::string GraphImpl::CaclulateCacheKey(const TensorSpec& spec, const void* data) {
  std::string md5_key;
  uint32_t data_size = 1;
  for (auto it = spec.shape_.begin(); it != spec.shape_.end(); ++it) {
    data_size *= *it;
  }
  switch (spec.datatype_) {
    case DataType::INT16:
    case DataType::UINT16:
    case DataType::FLOAT16:
      data_size *= 2;
      break;
    case DataType::INT32:
    case DataType::UINT32:
    case DataType::FLOAT32:
      data_size *= 4;
      break;
    case DataType::INT64:
      data_size *= 8;
      break;
    default:
      break;
  }
  if (data_size < 512) {
    md5_key = caclulateMd5Secret32(std::string((const char*)data, data_size));
  } else {
    md5_key = caclulateMd5Secret32(
        std::string((const char*)data, 512));  // Take first 512 bytes
  }
  return md5_key;
}

// Return the cached tensor when both the md5 key and the quantization
// parameters match; otherwise create a new tensor and add it to the cache.
std::shared_ptr<Tensor> GraphImpl::GetTensorFromCache(const TensorSpec& spec, const void* data) {
  std::shared_ptr<tim::vx::Tensor> tensor;
  std::string md5_key = CaclulateCacheKey(spec, data);
  if (GetTensorCacheMap().find(md5_key) != GetTensorCacheMap().end() &&
      GetTensorCacheMap()[md5_key]->GetQuantization().Scales() == spec.quantization_.Scales() &&
      GetTensorCacheMap()[md5_key]->GetQuantization().ZeroPoints() == spec.quantization_.ZeroPoints()) {
    tensor = GetTensorCacheMap()[md5_key];
  } else {
    tensor = std::make_shared<TensorImpl>(this, spec, data);
    GetTensorCacheMap()[md5_key] = tensor;
  }
  return tensor;
}
#endif

vsi_nn_graph_t* GraphImpl::graph() { return graph_; }

void GraphImpl::AddInput(vsi_nn_tensor_id_t id) {
@@ -135,6 +229,11 @@ void GraphImpl::PrintGraph() const { vsi_nn_PrintGraph(this->graph_); }
std::shared_ptr<Tensor> GraphImpl::CreateTensor(const TensorSpec& spec,
                                                const void* data) {
#ifdef ENABLE_TENSOR_CACHE
  // Constant tensors created with initial data are served from the cache.
  if (spec.attr_ & TensorAttribute::CONSTANT && data != NULL) {
    return GetTensorFromCache(spec, data);
  }
#endif
  auto tensor = std::make_shared<TensorImpl>(this, spec, data);
  if (spec.attr_ & TensorAttribute::INPUT) {
    this->AddInput(tensor);


@@ -26,6 +26,7 @@
#include "tim/vx/graph.h"
#include <vector>
#include <string>
#include <mutex>
#include <utility>
#include <map>
@@ -43,7 +44,12 @@ class GraphImpl : public Graph {
 public:
  GraphImpl(ContextImpl* context, const CompileOption& options = CompileOption::DefaultOptions);
  ~GraphImpl();
#ifdef ENABLE_TENSOR_CACHE
  std::shared_ptr<Tensor> GetTensorFromCache(const TensorSpec& spec, const void* data);
  const std::string CaclulateCacheKey(const TensorSpec& spec, const void* data);
  const std::string caclulateMd5Secret32(const std::string& src);
  std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GetTensorCacheMap();
#endif

  /// Return the low-level graph object
  vsi_nn_graph_t* graph();
  void AddInput(vsi_nn_tensor_id_t id);
@@ -97,7 +103,9 @@ class GraphImpl : public Graph {
  int32_t not_consumed_output_cnt_;
  std::map<std::shared_ptr<Tensor>, std::vector<std::shared_ptr<Operation>>> tensor_consumers_;
  std::map<std::shared_ptr<Tensor>, std::shared_ptr<Operation>> tensor_producer_;
#ifdef ENABLE_TENSOR_CACHE
  std::map<std::string, std::shared_ptr<tim::vx::Tensor>> cached_tensor_;
#endif
  CompileOption options_;

 private:
  /// Setup graph