diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java
index 2557121f2d..5090239315 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java
@@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory;
 
 import javax.security.auth.login.LoginException;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.URI;
 import java.security.PrivilegedExceptionAction;
@@ -93,7 +94,7 @@ private void doCopyRetry(FileSystem sourceFs, List<ReplChangeManager.FileInfo> srcFileList,
                            FileSystem destinationFs, Path destination,
                            boolean useRegularCopy) throws IOException, LoginException {
     int repeat = 0;
-    boolean isCopyError = false;
+    boolean isCopyError;
     List<Path> pathList = Lists.transform(srcFileList, ReplChangeManager.FileInfo::getEffectivePath);
     while (!pathList.isEmpty() && (repeat < MAX_COPY_RETRY)) {
       LOG.info("Attempt: " + (repeat+1) + ". Copying files: " + pathList);
@@ -102,6 +103,12 @@ private void doCopyRetry(FileSystem sourceFs, List<ReplChangeManager.FileInfo> srcFileList,
         doCopyOnce(sourceFs, pathList, destinationFs, destination, useRegularCopy);
       } catch (IOException e) {
         // If copy fails, fall through the retry logic
+        if (!(e instanceof FileNotFoundException)) {
+          LOG.info(" File operation failed with error : " + e.getMessage());
+          // looks like some network outage, reset the file system objects and retry.
+          sourceFs = pathList.get(0).getFileSystem(hiveConf);
+          destinationFs = destination.getFileSystem(hiveConf);
+        }
         isCopyError = true;
       }
       pathList = getFilesToRetry(sourceFs, srcFileList, destinationFs, destination, isCopyError);
@@ -156,16 +163,11 @@ private void doCopyRetry(FileSystem sourceFs, List<ReplChangeManager.FileInfo> srcFileList,
             srcFile.setIsUseSourcePath(false);
           }
         } else {
+          // The file copy might have failed for network issue. So retry with cm path again if the
+          // original file is missing.
if (srcFile.isUseSourcePath()) { // Source file missing, then try with CM path srcFile.setIsUseSourcePath(false); - } else { - // CM path itself is missing, cannot recover from this error - LOG.error("File Copy Failed. Both source and CM files are missing from source. " - + "Missing Source File: " + srcFile.getSourcePath() + ", CM File: " + srcFile.getCmPath() + ". " - + "Try setting higher value for hive.repl.cm.retain in source warehouse. " - + "Also, bootstrap the system again to get back the consistent replicated state."); - throw new IOException("Both source and CM path are missing from source."); } } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/events/InsertHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/events/InsertHandler.java index 5ac3af0c30..f9dda0187f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/events/InsertHandler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/events/InsertHandler.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.parse.EximUtil; import org.apache.hadoop.hive.ql.parse.repl.DumpType; import org.apache.hadoop.hive.ql.parse.repl.dump.Utils; +import org.apache.hadoop.hive.ql.parse.repl.dump.io.FileOperations; import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData; import java.io.BufferedWriter; @@ -81,10 +82,22 @@ public void handle(Context withinContext) throws Exception { dataPath = new Path(withinContext.eventRoot, qlPtns.get(0).getName()); } - // encoded filename/checksum of files, write into _files - try (BufferedWriter fileListWriter = writer(withinContext, dataPath)) { - for (String file : files) { - fileListWriter.write(file + "\n"); + int repeat = 0; + boolean done = false; + + while (!done) { + // encoded filename/checksum of files, write into _files + try (BufferedWriter fileListWriter = writer(withinContext, dataPath)) { + for (String file : files) { + fileListWriter.write(file + "\n"); + } + done = true; + } catch (IOException e) 
+        // For any error, restart the file write from the beginning as we don't know if the file is corrupted or not.
+        LOG.info("File write failed with error {}. Num retry {}", e.getMessage(), repeat++);
+        if (repeat == FileOperations.MAX_IO_ERROR_RETRY) {
+          throw e;
+        }
       }
     }
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/io/FileOperations.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/io/FileOperations.java
index 085f4a1cb1..a81899696b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/io/FileOperations.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/repl/dump/io/FileOperations.java
@@ -53,8 +53,10 @@
   private final Path exportRootDataDir;
   private final String distCpDoAsUser;
   private HiveConf hiveConf;
-  private final FileSystem dataFileSystem, exportFileSystem;
+  private final FileSystem exportFileSystem;
+  private FileSystem dataFileSystem;
   private final MmContext mmCtx;
+  public static final int MAX_IO_ERROR_RETRY = 5;
 
   public FileOperations(List<Path> dataPathList, Path exportRootDataDir, String distCpDoAsUser,
                         HiveConf hiveConf, MmContext mmCtx) throws IOException {
@@ -167,10 +169,26 @@ private void writeFilesList(FileStatus[] fileStatuses, BufferedWriter writer, String encodedSubDirs) throws IOException {
   }
 
   private FileStatus[] listFilesInDir(Path path) throws IOException {
-    return dataFileSystem.listStatus(path, p -> {
-      String name = p.getName();
-      return !name.startsWith("_") && !name.startsWith(".");
-    });
+    boolean done = false;
+    int repeat = 0;
+    FileStatus[] files = null;
+    while (!done) {
+      try {
+        files = dataFileSystem.listStatus(path, p -> {
+          String name = p.getName();
+          return !name.startsWith("_") && !name.startsWith(".");
+        });
+        // Listing succeeded: mark done, otherwise this loop would never terminate.
+        done = true;
+      } catch (IOException e) {
+        if (repeat++ == MAX_IO_ERROR_RETRY) {
+          throw e;
+        }
+        // in case of io error, reset the file system object and retry
+        dataFileSystem = path.getFileSystem(hiveConf);
+      }
+    }
+    return files;
   }
 
   private BufferedWriter writer() throws IOException {