Skip to content

Commit ba83384

Browse files
author
youxiao
committed
Adapt to adxl connection auto release feature
1 parent 496cecd commit ba83384

File tree

3 files changed

+37
-14
lines changed

3 files changed

+37
-14
lines changed

mooncake-common/common.cmake

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -160,14 +160,15 @@ if (USE_ASCEND OR USE_ASCEND_DIRECT)
160160
file(GLOB ASCEND_TOOLKIT_ROOT "/usr/local/Ascend/ascend-toolkit/latest/*-linux")
161161
endif()
162162
set(ASCEND_LIB_DIR "${ASCEND_TOOLKIT_ROOT}/lib64")
163-
set(ASCEND_DEVLIB_DIR "${ASCEND_TOOLKIT_ROOT}/devlib")
164163
set(ASCEND_INCLUDE_DIR "${ASCEND_TOOLKIT_ROOT}/include")
165164
add_compile_options(-Wno-ignored-qualifiers)
166165
include_directories(/usr/local/include /usr/include ${ASCEND_INCLUDE_DIR})
167-
link_directories(${ASCEND_LIB_DIR} ${ASCEND_DEVLIB_DIR})
166+
link_directories(${ASCEND_LIB_DIR})
168167
endif()
169168

170169
if (USE_ASCEND)
170+
set(ASCEND_DEVLIB_DIR "${ASCEND_TOOLKIT_ROOT}/devlib")
171+
link_directories(${ASCEND_DEVLIB_DIR})
171172
add_compile_definitions(USE_ASCEND)
172173
endif()
173174

mooncake-transfer-engine/include/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.h

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,11 @@ class AscendDirectTransport : public Transport {
8181

8282
void processSliceList(const std::vector<Slice *> &slice_list);
8383

84+
void connectAndTransfer(const std::string &target_adxl_engine_name,
85+
adxl::TransferOp operation,
86+
const std::vector<Slice *> &slice_list,
87+
int32_t times = 0);
88+
8489
void localCopy(TransferRequest::OpCode opcode,
8590
const std::vector<Slice *> &slice_list);
8691

@@ -105,7 +110,7 @@ class AscendDirectTransport : public Transport {
105110
int checkAndConnect(const std::string &target_adxl_engine_name);
106111

107112
int disconnect(const std::string &target_adxl_engine_name,
108-
int32_t timeout_in_millis);
113+
int32_t timeout_in_millis, bool force = false);
109114

110115
std::atomic_bool running_;
111116
std::unique_ptr<adxl::AdxlEngine> adxl_;

mooncake-transfer-engine/src/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.cpp

Lines changed: 28 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,8 @@
3636
namespace mooncake {
3737
namespace {
3838
constexpr size_t kMemcpyBatchLimit = 4096;
39-
}
39+
constexpr int32_t kMaxAdxlConnectRetries = 3;
40+
} // namespace
4041
AscendDirectTransport::AscendDirectTransport() : running_(false) {}
4142

4243
AscendDirectTransport::~AscendDirectTransport() {
@@ -78,6 +79,7 @@ AscendDirectTransport::~AscendDirectTransport() {
7879
}
7980
}
8081
addr_to_mem_handle_.clear();
82+
adxl_->Finalize();
8183
}
8284

8385
int AscendDirectTransport::install(std::string &local_server_name,
@@ -582,10 +584,14 @@ void AscendDirectTransport::processSliceList(
582584
<< "us";
583585
return;
584586
}
587+
return connectAndTransfer(target_adxl_engine_name, operation, slice_list);
588+
}
589+
590+
void AscendDirectTransport::connectAndTransfer(
591+
const std::string &target_adxl_engine_name, adxl::TransferOp operation,
592+
const std::vector<Slice *> &slice_list, int32_t times) {
585593
int ret = checkAndConnect(target_adxl_engine_name);
586594
if (ret != 0) {
587-
LOG(ERROR) << "Failed to connect to segment: "
588-
<< target_segment_desc->name;
589595
for (auto &slice : slice_list) {
590596
slice->markFailed();
591597
}
@@ -613,6 +619,14 @@ void AscendDirectTransport::processSliceList(
613619
std::chrono::steady_clock::now() - start)
614620
.count()
615621
<< " us";
622+
} else if (status == adxl::NOT_CONNECTED) {
623+
LOG(INFO) << "Connection reset by backend, retry times:" << times;
624+
disconnect(target_adxl_engine_name, 0, true);
625+
if (times < kMaxAdxlConnectRetries) {
626+
return connectAndTransfer(target_adxl_engine_name, operation,
627+
slice_list, times + 1);
628+
}
629+
return;
616630
} else {
617631
if (status == adxl::TIMEOUT) {
618632
LOG(ERROR) << "Transfer timeout to: " << target_adxl_engine_name
@@ -844,21 +858,24 @@ int AscendDirectTransport::checkAndConnect(
844858
}
845859

846860
int AscendDirectTransport::disconnect(
847-
const std::string &target_adxl_engine_name, int32_t timeout_in_millis) {
861+
const std::string &target_adxl_engine_name, int32_t timeout_in_millis,
862+
bool force) {
848863
std::lock_guard<std::mutex> lock(connection_mutex_);
849864
auto it = connected_segments_.find(target_adxl_engine_name);
850865
if (it == connected_segments_.end()) {
851866
LOG(INFO) << "Target adxl engine: " << target_adxl_engine_name
852867
<< " is not connected.";
853868
return 0;
854869
}
855-
auto status =
856-
adxl_->Disconnect(target_adxl_engine_name.c_str(), timeout_in_millis);
857-
if (status != adxl::SUCCESS) {
858-
LOG(ERROR) << "Failed to disconnect to: " << target_adxl_engine_name
859-
<< ", status: " << status;
860-
connected_segments_.erase(target_adxl_engine_name);
861-
return -1;
870+
if (!force) {
871+
auto status = adxl_->Disconnect(target_adxl_engine_name.c_str(),
872+
timeout_in_millis);
873+
if (status != adxl::SUCCESS) {
874+
LOG(ERROR) << "Failed to disconnect to: " << target_adxl_engine_name
875+
<< ", status: " << status;
876+
connected_segments_.erase(target_adxl_engine_name);
877+
return -1;
878+
}
862879
}
863880
connected_segments_.erase(target_adxl_engine_name);
864881
return 0;

0 commit comments

Comments
 (0)