Skip to content

Commit

Permalink
Use the apache hive logic to fetch escape delimiter in Hive SerDeOptions…
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergey Pershin authored and facebook-github-bot committed Sep 4, 2024
1 parent cb64585 commit eb055d4
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 8 deletions.
22 changes: 17 additions & 5 deletions velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,11 +489,23 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
mapKeyDelim = parseDelimiter(mapKeyIt->second);
}

uint8_t escapeChar;
bool hasEscapeChar = false;
if (escapeCharIt != serdeParameters.end() && !escapeCharIt->second.empty()) {
hasEscapeChar = true;
escapeChar = escapeCharIt->second[0];
// If escape character is specified then we use it, unless it is empty - in
// which case we default to '\\'.
// If escape character is not specified (not in the map) we turn escaping off.
// Logic is based on apache hive java code:
// https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108
uint8_t escapeChar = '\\';
const bool hasEscapeChar = (escapeCharIt != serdeParameters.end());
if (hasEscapeChar) {
if (!escapeCharIt->second.empty()) {
// If the escape char string is convertible to uint8_t (e.g. "38") then we
// use it as the character code, otherwise we use the 1st character of the
// string.
try {
escapeChar = folly::to<uint8_t>(escapeCharIt->second);
} catch (const std::exception&) {
escapeChar = escapeCharIt->second[0];
}
}
}

auto serDeOptions = hasEscapeChar
Expand Down
38 changes: 35 additions & 3 deletions velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,38 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Empty escape delim means default escape char.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "";
expectedSerDe.escapeChar = '\\';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Convertible to byte escape char - use it.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "38";
expectedSerDe.escapeChar = '&';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Overflow byte escape char - fall back to the 1st character of the string.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "381";
expectedSerDe.escapeChar = '3';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Not convertible string - fall back to the 1st character of the string.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "7!";
expectedSerDe.escapeChar = '7';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Modify all previous together.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kFieldDelim] = '~';
Expand All @@ -167,13 +199,13 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '$';
serdeParameters[SerDeOptions::kMapKeyDelim] = '*';
expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '*';
serdeParameters[SerDeOptions::kEscapeChar] = '*';
expectedSerDe.escapeChar = '*';
expectedSerDe.isEscaped = true;
tableParameters[TableParameter::kSerializationNullFormat] = "";
expectedSerDe.nullString = "";
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Tests other custom reader options.
clearDynamicParameters(FileFormat::TEXT);
Expand Down

0 comments on commit eb055d4

Please sign in to comment.