|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, software |
| 13 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + * See the License for the specific language governing permissions and |
| 16 | + * limitations under the License. |
| 17 | + */ |
| 18 | + |
| 19 | +package org.apache.iceberg.mr.hive.actions; |
| 20 | + |
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.stream.StreamSupport;
import org.apache.hadoop.hive.ql.ddl.misc.msck.MsckDesc;
import org.apache.hadoop.hive.ql.ddl.misc.msck.MsckResult;
import org.apache.iceberg.DeleteFiles;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.mr.hive.IcebergTableUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
| 36 | + |
| 37 | +/** |
| 38 | + * Repairs an Iceberg table by removing dangling file references. |
| 39 | + * <p> |
| 40 | + * Detects and removes references to data files that are missing from the filesystem |
| 41 | + * but still referenced in metadata. Supports dry-run mode and parallel execution. |
| 42 | + */ |
| 43 | +public class HiveIcebergRepairTable { |
| 44 | + |
| 45 | + private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergRepairTable.class); |
| 46 | + private static final int DEFAULT_NUM_THREADS = 4; |
| 47 | + |
| 48 | + private final Table table; |
| 49 | + private final MsckDesc desc; |
| 50 | + private final int numThreads; |
| 51 | + |
| 52 | + public HiveIcebergRepairTable(Table table, MsckDesc desc) { |
| 53 | + this(table, desc, DEFAULT_NUM_THREADS); |
| 54 | + } |
| 55 | + |
| 56 | + public HiveIcebergRepairTable(Table table, MsckDesc desc, int numThreads) { |
| 57 | + this.table = table; |
| 58 | + this.desc = desc; |
| 59 | + this.numThreads = numThreads; |
| 60 | + } |
| 61 | + |
| 62 | + /** |
| 63 | + * Executes the repair operation within a provided transaction. |
| 64 | + * |
| 65 | + * @param transaction the Iceberg transaction to use |
| 66 | + * @return repair result containing number of issues fixed and log message |
| 67 | + * @throws IOException if metadata validation or file check fails |
| 68 | + */ |
| 69 | + public MsckResult execute(Transaction transaction) throws IOException { |
| 70 | + List<String> missingFiles = getMissingFiles(); |
| 71 | + |
| 72 | + if (missingFiles.isEmpty()) { |
| 73 | + String msg = "No missing files detected"; |
| 74 | + LOG.info(msg); |
| 75 | + return new MsckResult(0, msg, new java.util.ArrayList<>()); |
| 76 | + } else if (!desc.isDryRun()) { |
| 77 | + // Only commit changes if not in dry-run mode |
| 78 | + DeleteFiles deleteFiles = transaction.newDelete(); |
| 79 | + for (String path : missingFiles) { |
| 80 | + deleteFiles.deleteFile(path); |
| 81 | + } |
| 82 | + deleteFiles.commit(); |
| 83 | + } |
| 84 | + |
| 85 | + String summaryMsg = desc.isDryRun() ? |
| 86 | + "Would remove %d dangling file reference(s)".formatted(missingFiles.size()) : |
| 87 | + "Removed %d dangling file reference(s)".formatted(missingFiles.size()); |
| 88 | + LOG.info(summaryMsg); |
| 89 | + |
| 90 | + String detailedMsg = desc.isDryRun() ? |
| 91 | + "Iceberg table repair (dry-run): %s".formatted(summaryMsg) : |
| 92 | + "Iceberg table repair completed: %s".formatted(summaryMsg); |
| 93 | + |
| 94 | + return new MsckResult(missingFiles.size(), detailedMsg, missingFiles); |
| 95 | + } |
| 96 | + |
| 97 | + /** |
| 98 | + * Executes the repair operation, automatically creating and committing a transaction. |
| 99 | + * |
| 100 | + * @return repair result containing removed files and statistics |
| 101 | + * @throws IOException if metadata validation or file check fails |
| 102 | + */ |
| 103 | + public MsckResult execute() throws IOException { |
| 104 | + Transaction transaction = table.newTransaction(); |
| 105 | + MsckResult result = execute(transaction); |
| 106 | + if (!desc.isDryRun() && result.getNumIssues() > 0) { |
| 107 | + transaction.commitTransaction(); |
| 108 | + } |
| 109 | + return result; |
| 110 | + } |
| 111 | + |
| 112 | + /** |
| 113 | + * Finds all missing data files by checking their physical existence in parallel. |
| 114 | + * |
| 115 | + * @return list of file paths for missing data files |
| 116 | + * @throws IOException if the file check operation fails or is interrupted |
| 117 | + */ |
| 118 | + private List<String> getMissingFiles() throws IOException { |
| 119 | + try (ExecutorService executorService = IcebergTableUtil.newFixedThreadPool( |
| 120 | + "repair-table-" + table.name(), numThreads); |
| 121 | + CloseableIterable<FileScanTask> fileScanTasks = table.newScan().planFiles()) { |
| 122 | + return executorService.submit(() -> |
| 123 | + StreamSupport.stream(fileScanTasks.spliterator(), true) |
| 124 | + .map(task -> task.file().location()) |
| 125 | + .filter(path -> !table.io().newInputFile(path).exists()) |
| 126 | + .toList() |
| 127 | + ).get(); |
| 128 | + } catch (InterruptedException e) { |
| 129 | + Thread.currentThread().interrupt(); |
| 130 | + throw new IOException("Interrupted while checking for missing files", e); |
| 131 | + |
| 132 | + } catch (ExecutionException e) { |
| 133 | + throw new IOException("Failed to check for missing files: " + e.getMessage(), e); |
| 134 | + } |
| 135 | + } |
| 136 | +} |
0 commit comments