001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.mapreduce; 019 020import static org.apache.hadoop.hbase.regionserver.HStoreFile.BULKLOAD_TASK_KEY; 021import static org.apache.hadoop.hbase.regionserver.HStoreFile.BULKLOAD_TIME_KEY; 022import static org.apache.hadoop.hbase.regionserver.HStoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY; 023import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY; 024 025import java.io.IOException; 026import java.io.UnsupportedEncodingException; 027import java.net.InetSocketAddress; 028import java.net.URLDecoder; 029import java.net.URLEncoder; 030import java.nio.charset.Charset; 031import java.util.ArrayList; 032import java.util.Arrays; 033import java.util.Collections; 034import java.util.List; 035import java.util.Map; 036import java.util.Map.Entry; 037import java.util.Set; 038import java.util.TreeMap; 039import java.util.TreeSet; 040import java.util.UUID; 041import java.util.function.Function; 042import java.util.stream.Collectors; 043import org.apache.commons.lang3.StringUtils; 044import org.apache.hadoop.conf.Configuration; 045import org.apache.hadoop.fs.FileSystem; 046import 
org.apache.hadoop.fs.Path; 047import org.apache.hadoop.hbase.Cell; 048import org.apache.hadoop.hbase.CellUtil; 049import org.apache.hadoop.hbase.HConstants; 050import org.apache.hadoop.hbase.HRegionLocation; 051import org.apache.hadoop.hbase.HTableDescriptor; 052import org.apache.hadoop.hbase.KeyValue; 053import org.apache.hadoop.hbase.KeyValueUtil; 054import org.apache.hadoop.hbase.PrivateCellUtil; 055import org.apache.hadoop.hbase.TableName; 056import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; 057import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; 058import org.apache.hadoop.hbase.client.Connection; 059import org.apache.hadoop.hbase.client.ConnectionFactory; 060import org.apache.hadoop.hbase.client.Put; 061import org.apache.hadoop.hbase.client.RegionLocator; 062import org.apache.hadoop.hbase.client.Table; 063import org.apache.hadoop.hbase.client.TableDescriptor; 064import org.apache.hadoop.hbase.fs.HFileSystem; 065import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 066import org.apache.hadoop.hbase.io.compress.Compression; 067import org.apache.hadoop.hbase.io.compress.Compression.Algorithm; 068import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; 069import org.apache.hadoop.hbase.io.hfile.CacheConfig; 070import org.apache.hadoop.hbase.io.hfile.HFile; 071import org.apache.hadoop.hbase.io.hfile.HFileContext; 072import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; 073import org.apache.hadoop.hbase.io.hfile.HFileWriterImpl; 074import org.apache.hadoop.hbase.regionserver.BloomType; 075import org.apache.hadoop.hbase.regionserver.HStore; 076import org.apache.hadoop.hbase.regionserver.StoreFileWriter; 077import org.apache.hadoop.hbase.regionserver.StoreUtils; 078import org.apache.hadoop.hbase.util.BloomFilterUtil; 079import org.apache.hadoop.hbase.util.Bytes; 080import org.apache.hadoop.hbase.util.CommonFSUtils; 081import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 082import 
org.apache.hadoop.hbase.util.MapReduceExtendedCell; 083import org.apache.hadoop.hbase.util.ReflectionUtils; 084import org.apache.hadoop.io.NullWritable; 085import org.apache.hadoop.io.SequenceFile; 086import org.apache.hadoop.io.Text; 087import org.apache.hadoop.io.Writable; 088import org.apache.hadoop.mapreduce.Job; 089import org.apache.hadoop.mapreduce.OutputCommitter; 090import org.apache.hadoop.mapreduce.OutputFormat; 091import org.apache.hadoop.mapreduce.RecordWriter; 092import org.apache.hadoop.mapreduce.TaskAttemptContext; 093import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 094import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; 095import org.apache.yetus.audience.InterfaceAudience; 096import org.slf4j.Logger; 097import org.slf4j.LoggerFactory; 098 099/** 100 * Writes HFiles. Passed Cells must arrive in order. Writes current time as the sequence id for the 101 * file. Sets the major compacted attribute on created {@link HFile}s. Calling write(null,null) will 102 * forcibly roll all HFiles being written. 103 * <p> 104 * Using this class as part of a MapReduce job is best done using 105 * {@link #configureIncrementalLoad(Job, TableDescriptor, RegionLocator)}. 106 */ 107@InterfaceAudience.Public 108public class HFileOutputFormat2 extends FileOutputFormat<ImmutableBytesWritable, Cell> { 109 private static final Logger LOG = LoggerFactory.getLogger(HFileOutputFormat2.class); 110 111 static class TableInfo { 112 private TableDescriptor tableDesctiptor; 113 private RegionLocator regionLocator; 114 115 public TableInfo(TableDescriptor tableDesctiptor, RegionLocator regionLocator) { 116 this.tableDesctiptor = tableDesctiptor; 117 this.regionLocator = regionLocator; 118 } 119 120 /** 121 * The modification for the returned HTD doesn't affect the inner TD. 122 * @return A clone of inner table descriptor 123 * @deprecated since 2.0.0 and will be removed in 3.0.0. Use {@link #getTableDescriptor()} 124 * instead. 
125 * @see #getTableDescriptor() 126 * @see <a href="https://issues.apache.org/jira/browse/HBASE-18241">HBASE-18241</a> 127 */ 128 @Deprecated 129 public HTableDescriptor getHTableDescriptor() { 130 return new HTableDescriptor(tableDesctiptor); 131 } 132 133 public TableDescriptor getTableDescriptor() { 134 return tableDesctiptor; 135 } 136 137 public RegionLocator getRegionLocator() { 138 return regionLocator; 139 } 140 } 141 142 protected static final byte[] tableSeparator = Bytes.toBytes(";"); 143 144 protected static byte[] combineTableNameSuffix(byte[] tableName, byte[] suffix) { 145 return Bytes.add(tableName, tableSeparator, suffix); 146 } 147 148 // The following constants are private since these are used by 149 // HFileOutputFormat2 to internally transfer data between job setup and 150 // reducer run using conf. 151 // These should not be changed by the client. 152 static final String COMPRESSION_FAMILIES_CONF_KEY = 153 "hbase.hfileoutputformat.families.compression"; 154 static final String BLOOM_TYPE_FAMILIES_CONF_KEY = "hbase.hfileoutputformat.families.bloomtype"; 155 static final String BLOOM_PARAM_FAMILIES_CONF_KEY = "hbase.hfileoutputformat.families.bloomparam"; 156 static final String BLOCK_SIZE_FAMILIES_CONF_KEY = "hbase.mapreduce.hfileoutputformat.blocksize"; 157 static final String DATABLOCK_ENCODING_FAMILIES_CONF_KEY = 158 "hbase.mapreduce.hfileoutputformat.families.datablock.encoding"; 159 160 // When MULTI_TABLE_HFILEOUTPUTFORMAT_CONF_KEY is enabled, should table names be written 161 // with namespace included. Enabling this means downstream jobs which use this output will 162 // need to account for namespace when finding the directory of the job output. 163 // For example: a table named my-table in namespace default would be in `/output/default/my-table` 164 // instead of current `/output/my-table` 165 // This will be the behavior when upgrading to hbase 3.0. 
  public static final String TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_KEY =
    "hbase.hfileoutputformat.tablename.namespace.inclusive";

  private static final boolean TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_DEFAULT_VALUE = false;

  // This constant is public since the client can modify this when setting
  // up their conf object and thus refer to this symbol.
  // It is present for backwards compatibility reasons. Use it only to
  // override the auto-detection of datablock encoding and compression.
  public static final String DATABLOCK_ENCODING_OVERRIDE_CONF_KEY =
    "hbase.mapreduce.hfileoutputformat.datablock.encoding";
  public static final String COMPRESSION_OVERRIDE_CONF_KEY =
    "hbase.mapreduce.hfileoutputformat.compression";

  /**
   * Keep locality while generating HFiles for bulkload. See HBASE-12596
   */
  public static final String LOCALITY_SENSITIVE_CONF_KEY =
    "hbase.bulkload.locality.sensitive.enabled";
  private static final boolean DEFAULT_LOCALITY_SENSITIVE = true;
  static final String OUTPUT_TABLE_NAME_CONF_KEY = "hbase.mapreduce.hfileoutputformat.table.name";
  static final String MULTI_TABLE_HFILEOUTPUTFORMAT_CONF_KEY =
    "hbase.mapreduce.use.multi.table.hfileoutputformat";

  /**
   * ExtendedCell and ExtendedCellSerialization are InterfaceAudience.Private. We expose this config
   * for internal usage in jobs like WALPlayer which need to use features of ExtendedCell.
   */
  @InterfaceAudience.Private
  public static final String EXTENDED_CELL_SERIALIZATION_ENABLED_KEY =
    "hbase.mapreduce.hfileoutputformat.extendedcell.enabled";
  // NOTE(review): constant name misspells "DEFAULT". It is package-visible, so renaming it
  // could break other classes in this package; leaving as-is and flagging instead.
  static final boolean EXTENDED_CELL_SERIALIZATION_ENABLED_DEFULT = false;

  @InterfaceAudience.Private
  public static final String DISK_BASED_SORTING_ENABLED_KEY =
    "hbase.mapreduce.hfileoutputformat.disk.based.sorting.enabled";
  private static final boolean DISK_BASED_SORTING_ENABLED_DEFAULT = false;

  // Keys used to point region-location lookups at a remote cluster (see HBASE-25608 and
  // configureRemoteCluster). Any other key under this prefix is copied into the remote conf
  // with the prefix stripped.
  public static final String REMOTE_CLUSTER_CONF_PREFIX = "hbase.hfileoutputformat.remote.cluster.";
  public static final String REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY =
    REMOTE_CLUSTER_CONF_PREFIX + "zookeeper.quorum";
  public static final String REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY =
    REMOTE_CLUSTER_CONF_PREFIX + "zookeeper." + HConstants.CLIENT_PORT_STR;
  public static final String REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY =
    REMOTE_CLUSTER_CONF_PREFIX + HConstants.ZOOKEEPER_ZNODE_PARENT;

  public static final String STORAGE_POLICY_PROPERTY = HStore.BLOCK_STORAGE_POLICY_KEY;
  public static final String STORAGE_POLICY_PROPERTY_CF_PREFIX = STORAGE_POLICY_PROPERTY + ".";

  @Override
  public RecordWriter<ImmutableBytesWritable, Cell>
    getRecordWriter(final TaskAttemptContext context) throws IOException, InterruptedException {
    return createRecordWriter(context, this.getOutputCommitter(context));
  }

  protected static byte[] getTableNameSuffixedWithFamily(byte[] tableName, byte[] family) {
    return combineTableNameSuffix(tableName, family);
  }

  // Reflection is used because getWorkPath is declared on concrete committer classes
  // (e.g. FileOutputCommitter), not on the OutputCommitter base type.
  protected static Path getWorkPath(final OutputCommitter committer) {
    return (Path) ReflectionUtils.invokeMethod(committer, "getWorkPath");
  }

  /**
   * Builds the {@link RecordWriter} that writes Cells into per-family (and, in multi-table mode,
   * per-table) HFile directories under the committer's work path. One {@link StoreFileWriter} is
   * kept open per table+family; a writer is rolled when it exceeds the configured max file size,
   * but only on a row boundary so a row never spans two HFiles.
   */
  static <V extends Cell> RecordWriter<ImmutableBytesWritable, V> createRecordWriter(
    final TaskAttemptContext context, final OutputCommitter committer) throws IOException {

    // Get the path of the temporary output file
    final Path outputDir = getWorkPath(committer);
    final Configuration conf = context.getConfiguration();
    final boolean writeMultipleTables =
      conf.getBoolean(MULTI_TABLE_HFILEOUTPUTFORMAT_CONF_KEY, false);
    final boolean writeToTableWithNamespace = conf.getBoolean(
      TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_KEY, TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_DEFAULT_VALUE);
    final String writeTableNames = conf.get(OUTPUT_TABLE_NAME_CONF_KEY);
    if (writeTableNames == null || writeTableNames.isEmpty()) {
      throw new IllegalArgumentException("" + OUTPUT_TABLE_NAME_CONF_KEY + " cannot be empty");
    }
    final FileSystem fs = outputDir.getFileSystem(conf);
    // These configs. are from hbase-*.xml
    final long maxsize =
      conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr =
      conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = HFileWriterImpl.compressionByName(defaultCompressionStr);
    String compressionStr = conf.get(COMPRESSION_OVERRIDE_CONF_KEY);
    final Algorithm overriddenCompression =
      compressionStr != null ? Compression.getCompressionAlgorithmByName(compressionStr) : null;
    final boolean compactionExclude =
      conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude", false);
    final Set<String> allTableNames = Arrays
      .stream(writeTableNames.split(Bytes.toString(tableSeparator))).collect(Collectors.toSet());

    // create a map from column family to the compression algorithm
    final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf);
    final Map<byte[], String> bloomParamMap = createFamilyBloomParamMap(conf);
    final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
    final Map<byte[], DataBlockEncoding> datablockEncodingMap =
      createFamilyDataBlockEncodingMap(conf);
    final DataBlockEncoding overriddenEncoding =
      dataBlockEncodingStr != null ? DataBlockEncoding.valueOf(dataBlockEncodingStr) : null;

    return new RecordWriter<ImmutableBytesWritable, V>() {
      // Map of families to writers and how much has been output on the writer.
      // Keyed by table+family (see getTableNameSuffixedWithFamily).
      private final Map<byte[], WriterLength> writers = new TreeMap<>(Bytes.BYTES_COMPARATOR);
      // Last row seen per family; used to ensure writers only roll on row boundaries.
      private final Map<byte[], byte[]> previousRows = new TreeMap<>(Bytes.BYTES_COMPARATOR);
      private final long now = EnvironmentEdgeManager.currentTime();
      // In single-table mode the target table is fixed; in multi-table mode it is decoded
      // from each incoming row key prefix.
      private byte[] tableNameBytes = writeMultipleTables ? null : Bytes.toBytes(writeTableNames);

      @Override
      public void write(ImmutableBytesWritable row, V cell) throws IOException {
        Cell kv = cell;
        // null input == user explicitly wants to flush
        if (row == null && kv == null) {
          rollWriters(null);
          return;
        }

        byte[] rowKey = CellUtil.cloneRow(kv);
        int length = (PrivateCellUtil.estimatedSerializedSizeOf(kv)) - Bytes.SIZEOF_INT;
        byte[] family = CellUtil.cloneFamily(kv);
        if (writeMultipleTables) {
          tableNameBytes = MultiTableHFileOutputFormat.getTableName(row.get());
          tableNameBytes = writeToTableWithNamespace
            ? TableName.valueOf(tableNameBytes).getNameWithNamespaceInclAsString()
              .getBytes(Charset.defaultCharset())
            : TableName.valueOf(tableNameBytes).toBytes();
          if (!allTableNames.contains(Bytes.toString(tableNameBytes))) {
            throw new IllegalArgumentException(
              "TableName " + Bytes.toString(tableNameBytes) + " not expected");
          }
        }
        byte[] tableAndFamily = getTableNameSuffixedWithFamily(tableNameBytes, family);

        WriterLength wl = this.writers.get(tableAndFamily);

        // If this is a new column family, verify that the directory exists
        if (wl == null) {
          Path writerPath = null;
          if (writeMultipleTables) {
            Path tableRelPath = getTableRelativePath(tableNameBytes);
            writerPath = new Path(outputDir, new Path(tableRelPath, Bytes.toString(family)));
          } else {
            writerPath = new Path(outputDir, Bytes.toString(family));
          }
          fs.mkdirs(writerPath);
          configureStoragePolicy(conf, fs, tableAndFamily, writerPath);
        }

        // Roll when over max size, but only once a row is finished: rolling mid-row would
        // split one row across two HFiles.
        if (
          wl != null && wl.written + length >= maxsize
            && Bytes.compareTo(this.previousRows.get(family), rowKey) != 0
        ) {
          rollWriters(wl);
        }

        // create a new HFile writer, if necessary
        if (wl == null || wl.writer == null) {
          InetSocketAddress[] favoredNodes = null;
          if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
            // Best-effort: look up the region hosting this row so the HFile blocks can be
            // placed on that region server. Any failure falls back to the default writer.
            HRegionLocation loc = null;
            String tableName = Bytes.toString(tableNameBytes);
            if (tableName != null) {
              try (
                Connection connection =
                  ConnectionFactory.createConnection(createRemoteClusterConf(conf));
                RegionLocator locator =
                  connection.getRegionLocator(TableName.valueOf(tableName))) {
                loc = locator.getRegionLocation(rowKey);
              } catch (Throwable e) {
                LOG.warn("Something wrong locating rowkey {} in {}", Bytes.toString(rowKey),
                  tableName, e);
                loc = null;
              }
            }
            if (null == loc) {
              LOG.trace("Failed get of location, use default writer {}", Bytes.toString(rowKey));
            } else {
              LOG.debug("First rowkey: [{}]", Bytes.toString(rowKey));
              InetSocketAddress initialIsa =
                new InetSocketAddress(loc.getHostname(), loc.getPort());
              if (initialIsa.isUnresolved()) {
                LOG.trace("Failed resolve address {}, use default writer", loc.getHostnamePort());
              } else {
                LOG.debug("Use favored nodes writer: {}", initialIsa.getHostString());
                favoredNodes = new InetSocketAddress[] { initialIsa };
              }
            }
          }
          wl = getNewWriter(tableNameBytes, family, conf, favoredNodes);

        }

        // we now have the proper HFile writer. full steam ahead
        PrivateCellUtil.updateLatestStamp(cell, this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Copy the row so we know when a row transition.
        this.previousRows.put(family, rowKey);
      }

      /** Converts "namespace:qualifier" into the relative path namespace/qualifier. */
      private Path getTableRelativePath(byte[] tableNameBytes) {
        String tableName = Bytes.toString(tableNameBytes);
        String[] tableNameParts = tableName.split(":");
        Path tableRelPath = new Path(tableNameParts[0]);
        if (tableNameParts.length > 1) {
          tableRelPath = new Path(tableRelPath, tableNameParts[1]);
        }
        return tableRelPath;
      }

      /** Closes the given writer, or every open writer when {@code writerLength} is null. */
      private void rollWriters(WriterLength writerLength) throws IOException {
        if (writerLength != null) {
          closeWriter(writerLength);
        } else {
          for (WriterLength wl : this.writers.values()) {
            closeWriter(wl);
          }
        }
      }

      private void closeWriter(WriterLength wl) throws IOException {
        if (wl.writer != null) {
          LOG.info(
            "Writer=" + wl.writer.getPath() + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
          close(wl.writer);
          wl.writer = null;
        }
        wl.written = 0;
      }

      /**
       * Builds the conf used for region-location lookups: when the three remote-cluster ZK keys
       * are all set they replace the local ZK settings, and any other key under
       * {@link #REMOTE_CLUSTER_CONF_PREFIX} is copied over with the prefix stripped.
       */
      private Configuration createRemoteClusterConf(Configuration conf) {
        final Configuration newConf = new Configuration(conf);

        final String quorum = conf.get(REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY);
        final String clientPort = conf.get(REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY);
        final String parent = conf.get(REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY);

        if (quorum != null && clientPort != null && parent != null) {
          newConf.set(HConstants.ZOOKEEPER_QUORUM, quorum);
          newConf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.parseInt(clientPort));
          newConf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, parent);
        }

        for (Entry<String, String> entry : conf) {
          String key = entry.getKey();
          if (
            REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY.equals(key)
              || REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY.equals(key)
              || REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY.equals(key)
          ) {
            // Handled them above
            continue;
          }

          if (entry.getKey().startsWith(REMOTE_CLUSTER_CONF_PREFIX)) {
            String originalKey = entry.getKey().substring(REMOTE_CLUSTER_CONF_PREFIX.length());
            if (!originalKey.isEmpty()) {
              newConf.set(originalKey, entry.getValue());
            }
          }
        }

        return newConf;
      }

      /*
       * Create a new StoreFile.Writer. Per-family settings (compression, bloom type/param,
       * block size, encoding) come from the serialized conf maps; explicit overrides win.
       * @return A WriterLength, containing a new StoreFile.Writer.
       */
      @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED",
          justification = "Not important")
      private WriterLength getNewWriter(byte[] tableName, byte[] family, Configuration conf,
        InetSocketAddress[] favoredNodes) throws IOException {
        byte[] tableAndFamily = getTableNameSuffixedWithFamily(tableName, family);
        Path familydir = new Path(outputDir, Bytes.toString(family));
        if (writeMultipleTables) {
          familydir =
            new Path(outputDir, new Path(getTableRelativePath(tableName), Bytes.toString(family)));
        }
        WriterLength wl = new WriterLength();
        // Resolution order for each attribute: explicit override > per-family conf > default.
        Algorithm compression = overriddenCompression;
        compression = compression == null ? compressionMap.get(tableAndFamily) : compression;
        compression = compression == null ? defaultCompression : compression;
        BloomType bloomType = bloomTypeMap.get(tableAndFamily);
        bloomType = bloomType == null ? BloomType.NONE : bloomType;
        String bloomParam = bloomParamMap.get(tableAndFamily);
        if (bloomType == BloomType.ROWPREFIX_FIXED_LENGTH) {
          conf.set(BloomFilterUtil.PREFIX_LENGTH_KEY, bloomParam);
        }
        Integer blockSize = blockSizeMap.get(tableAndFamily);
        blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
        DataBlockEncoding encoding = overriddenEncoding;
        encoding = encoding == null ? datablockEncodingMap.get(tableAndFamily) : encoding;
        encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
        HFileContextBuilder contextBuilder = new HFileContextBuilder().withCompression(compression)
          .withDataBlockEncoding(encoding).withChecksumType(StoreUtils.getChecksumType(conf))
          .withBytesPerCheckSum(StoreUtils.getBytesPerChecksum(conf)).withBlockSize(blockSize)
          .withColumnFamily(family).withTableName(tableName)
          .withCreateTime(EnvironmentEdgeManager.currentTime());

        if (HFile.getFormatVersion(conf) >= HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
          contextBuilder.withIncludesTags(true);
        }

        HFileContext hFileContext = contextBuilder.build();
        if (null == favoredNodes) {
          wl.writer =
            new StoreFileWriter.Builder(conf, CacheConfig.DISABLED, fs).withOutputDir(familydir)
              .withBloomType(bloomType).withFileContext(hFileContext).build();
        } else {
          // HFileSystem wrapper is needed to pass favored nodes down to the DFS output stream.
          wl.writer = new StoreFileWriter.Builder(conf, CacheConfig.DISABLED, new HFileSystem(fs))
            .withOutputDir(familydir).withBloomType(bloomType).withFileContext(hFileContext)
            .withFavoredNodes(favoredNodes).build();
        }

        this.writers.put(tableAndFamily, wl);
        return wl;
      }

      /** Stamps bulkload metadata into the file info block, then closes the writer. */
      private void close(final StoreFileWriter w) throws IOException {
        if (w != null) {
          w.appendFileInfo(BULKLOAD_TIME_KEY, Bytes.toBytes(EnvironmentEdgeManager.currentTime()));
          w.appendFileInfo(BULKLOAD_TASK_KEY, Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
          w.appendFileInfo(EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      @Override
      public void close(TaskAttemptContext c) throws IOException, InterruptedException {
        for (WriterLength wl : this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }

  /**
   * Configure block storage policy for CF after the directory is created.
512 */ 513 static void configureStoragePolicy(final Configuration conf, final FileSystem fs, 514 byte[] tableAndFamily, Path cfPath) { 515 if (null == conf || null == fs || null == tableAndFamily || null == cfPath) { 516 return; 517 } 518 519 String policy = conf.get(STORAGE_POLICY_PROPERTY_CF_PREFIX + Bytes.toString(tableAndFamily), 520 conf.get(STORAGE_POLICY_PROPERTY)); 521 CommonFSUtils.setStoragePolicy(fs, cfPath, policy); 522 } 523 524 /* 525 * Data structure to hold a Writer and amount of data written on it. 526 */ 527 static class WriterLength { 528 long written = 0; 529 StoreFileWriter writer = null; 530 } 531 532 /** 533 * Return the start keys of all of the regions in this table, as a list of ImmutableBytesWritable. 534 */ 535 private static List<ImmutableBytesWritable> getRegionStartKeys(List<RegionLocator> regionLocators, 536 boolean writeMultipleTables) throws IOException { 537 538 ArrayList<ImmutableBytesWritable> ret = new ArrayList<>(); 539 for (RegionLocator regionLocator : regionLocators) { 540 TableName tableName = regionLocator.getName(); 541 LOG.info("Looking up current regions for table " + tableName); 542 byte[][] byteKeys = regionLocator.getStartKeys(); 543 for (byte[] byteKey : byteKeys) { 544 byte[] fullKey = byteKey; // HFileOutputFormat2 use case 545 if (writeMultipleTables) { 546 // MultiTableHFileOutputFormat use case 547 fullKey = combineTableNameSuffix(tableName.getName(), byteKey); 548 } 549 if (LOG.isDebugEnabled()) { 550 LOG.debug("SplitPoint startkey for " + tableName + ": " + Bytes.toStringBinary(fullKey)); 551 } 552 ret.add(new ImmutableBytesWritable(fullKey)); 553 } 554 } 555 return ret; 556 } 557 558 /** 559 * Write out a {@link SequenceFile} that can be read by {@link TotalOrderPartitioner} that 560 * contains the split points in startKeys. 
561 */ 562 @SuppressWarnings("deprecation") 563 private static void writePartitions(Configuration conf, Path partitionsPath, 564 List<ImmutableBytesWritable> startKeys, boolean writeMultipleTables) throws IOException { 565 LOG.info("Writing partition information to " + partitionsPath); 566 if (startKeys.isEmpty()) { 567 throw new IllegalArgumentException("No regions passed"); 568 } 569 570 // We're generating a list of split points, and we don't ever 571 // have keys < the first region (which has an empty start key) 572 // so we need to remove it. Otherwise we would end up with an 573 // empty reducer with index 0 574 TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys); 575 ImmutableBytesWritable first = sorted.first(); 576 if (writeMultipleTables) { 577 first = 578 new ImmutableBytesWritable(MultiTableHFileOutputFormat.getSuffix(sorted.first().get())); 579 } 580 if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) { 581 throw new IllegalArgumentException( 582 "First region of table should have empty start key. Instead has: " 583 + Bytes.toStringBinary(first.get())); 584 } 585 sorted.remove(sorted.first()); 586 587 // Write the actual file 588 FileSystem fs = partitionsPath.getFileSystem(conf); 589 boolean diskBasedSortingEnabled = diskBasedSortingEnabled(conf); 590 Class<? extends Writable> keyClass = 591 diskBasedSortingEnabled ? KeyOnlyCellComparable.class : ImmutableBytesWritable.class; 592 SequenceFile.Writer writer = 593 SequenceFile.createWriter(fs, conf, partitionsPath, keyClass, NullWritable.class); 594 595 try { 596 for (ImmutableBytesWritable startKey : sorted) { 597 Writable writable = diskBasedSortingEnabled 598 ? new KeyOnlyCellComparable(KeyValueUtil.createFirstOnRow(startKey.get())) 599 : startKey; 600 601 writer.append(writable, NullWritable.get()); 602 } 603 } finally { 604 writer.close(); 605 } 606 } 607 608 /** 609 * Configure a MapReduce Job to perform an incremental load into the given table. 
This 610 * <ul> 611 * <li>Inspects the table to configure a total order partitioner</li> 612 * <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li> 613 * <li>Sets the number of reduce tasks to match the current number of regions</li> 614 * <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li> 615 * <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or 616 * PutSortReducer)</li> 617 * <li>Sets the HBase cluster key to load region locations for locality-sensitive</li> 618 * </ul> 619 * The user should be sure to set the map output value class to either KeyValue or Put before 620 * running this function. 621 */ 622 public static void configureIncrementalLoad(Job job, Table table, RegionLocator regionLocator) 623 throws IOException { 624 configureIncrementalLoad(job, table.getDescriptor(), regionLocator); 625 configureRemoteCluster(job, table.getConfiguration()); 626 } 627 628 /** 629 * Configure a MapReduce Job to perform an incremental load into the given table. This 630 * <ul> 631 * <li>Inspects the table to configure a total order partitioner</li> 632 * <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li> 633 * <li>Sets the number of reduce tasks to match the current number of regions</li> 634 * <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li> 635 * <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or 636 * PutSortReducer)</li> 637 * </ul> 638 * The user should be sure to set the map output value class to either KeyValue or Put before 639 * running this function. 
640 */ 641 public static void configureIncrementalLoad(Job job, TableDescriptor tableDescriptor, 642 RegionLocator regionLocator) throws IOException { 643 ArrayList<TableInfo> singleTableInfo = new ArrayList<>(); 644 singleTableInfo.add(new TableInfo(tableDescriptor, regionLocator)); 645 configureIncrementalLoad(job, singleTableInfo, HFileOutputFormat2.class); 646 } 647 648 public static boolean diskBasedSortingEnabled(Configuration conf) { 649 return conf.getBoolean(DISK_BASED_SORTING_ENABLED_KEY, DISK_BASED_SORTING_ENABLED_DEFAULT); 650 } 651 652 static void configureIncrementalLoad(Job job, List<TableInfo> multiTableInfo, 653 Class<? extends OutputFormat<?, ?>> cls) throws IOException { 654 Configuration conf = job.getConfiguration(); 655 job.setOutputKeyClass(ImmutableBytesWritable.class); 656 job.setOutputValueClass(MapReduceExtendedCell.class); 657 job.setOutputFormatClass(cls); 658 659 final boolean writeToTableWithNamespace = conf.getBoolean( 660 TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_KEY, TABLE_NAME_WITH_NAMESPACE_INCLUSIVE_DEFAULT_VALUE); 661 662 if (multiTableInfo.stream().distinct().count() != multiTableInfo.size()) { 663 throw new IllegalArgumentException("Duplicate entries found in TableInfo argument"); 664 } 665 boolean writeMultipleTables = false; 666 if (MultiTableHFileOutputFormat.class.equals(cls)) { 667 writeMultipleTables = true; 668 conf.setBoolean(MULTI_TABLE_HFILEOUTPUTFORMAT_CONF_KEY, true); 669 } 670 // Based on the configured map output class, set the correct reducer to properly 671 // sort the incoming values. 672 // TODO it would be nice to pick one or the other of these formats. 
673 boolean diskBasedSorting = diskBasedSortingEnabled(conf); 674 675 if (diskBasedSorting) { 676 job.setMapOutputKeyClass(KeyOnlyCellComparable.class); 677 job.setSortComparatorClass(KeyOnlyCellComparable.KeyOnlyCellComparator.class); 678 job.setReducerClass(PreSortedCellsReducer.class); 679 } else if ( 680 KeyValue.class.equals(job.getMapOutputValueClass()) 681 || MapReduceExtendedCell.class.equals(job.getMapOutputValueClass()) 682 ) { 683 job.setReducerClass(CellSortReducer.class); 684 } else if (Put.class.equals(job.getMapOutputValueClass())) { 685 job.setReducerClass(PutSortReducer.class); 686 } else if (Text.class.equals(job.getMapOutputValueClass())) { 687 job.setReducerClass(TextSortReducer.class); 688 } else { 689 LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass()); 690 } 691 692 mergeSerializations(conf); 693 694 if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) { 695 LOG.info("bulkload locality sensitive enabled"); 696 } 697 698 /* Now get the region start keys for every table required */ 699 List<String> allTableNames = new ArrayList<>(multiTableInfo.size()); 700 List<RegionLocator> regionLocators = new ArrayList<>(multiTableInfo.size()); 701 List<TableDescriptor> tableDescriptors = new ArrayList<>(multiTableInfo.size()); 702 703 for (TableInfo tableInfo : multiTableInfo) { 704 regionLocators.add(tableInfo.getRegionLocator()); 705 allTableNames.add(writeMultipleTables && writeToTableWithNamespace 706 ? 
tableInfo.getRegionLocator().getName().getNameWithNamespaceInclAsString() 707 : tableInfo.getRegionLocator().getName().getNameAsString()); 708 tableDescriptors.add(tableInfo.getTableDescriptor()); 709 } 710 // Record tablenames for creating writer by favored nodes, and decoding compression, 711 // block size and other attributes of columnfamily per table 712 conf.set(OUTPUT_TABLE_NAME_CONF_KEY, 713 StringUtils.join(allTableNames, Bytes.toString(tableSeparator))); 714 List<ImmutableBytesWritable> startKeys = 715 getRegionStartKeys(regionLocators, writeMultipleTables); 716 // Use table's region boundaries for TOP split points. 717 LOG.info("Configuring " + startKeys.size() + " reduce partitions " 718 + "to match current region count for all tables"); 719 job.setNumReduceTasks(startKeys.size()); 720 721 configurePartitioner(job, startKeys, writeMultipleTables); 722 // Set compression algorithms based on column families 723 724 conf.set(COMPRESSION_FAMILIES_CONF_KEY, 725 serializeColumnFamilyAttribute(compressionDetails, tableDescriptors)); 726 conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY, 727 serializeColumnFamilyAttribute(blockSizeDetails, tableDescriptors)); 728 conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY, 729 serializeColumnFamilyAttribute(bloomTypeDetails, tableDescriptors)); 730 conf.set(BLOOM_PARAM_FAMILIES_CONF_KEY, 731 serializeColumnFamilyAttribute(bloomParamDetails, tableDescriptors)); 732 conf.set(DATABLOCK_ENCODING_FAMILIES_CONF_KEY, 733 serializeColumnFamilyAttribute(dataBlockEncodingDetails, tableDescriptors)); 734 735 TableMapReduceUtil.addDependencyJars(job); 736 TableMapReduceUtil.initCredentials(job); 737 LOG.info("Incremental output configured for tables: " + StringUtils.join(allTableNames, ",")); 738 } 739 740 private static void mergeSerializations(Configuration conf) { 741 List<String> serializations = new ArrayList<>(); 742 743 // add any existing values that have been set 744 String[] existing = conf.getStrings("io.serializations"); 745 if (existing != 
null) {
      Collections.addAll(serializations, existing);
    }

    serializations.add(MutationSerialization.class.getName());
    serializations.add(ResultSerialization.class.getName());

    // Add ExtendedCellSerialization, if configured. Order matters here. Hadoop's
    // SerializationFactory runs through serializations in the order they are registered.
    // We want to register ExtendedCellSerialization before CellSerialization because both
    // work for ExtendedCells but only ExtendedCellSerialization handles them properly.
    if (
      conf.getBoolean(EXTENDED_CELL_SERIALIZATION_ENABLED_KEY,
        EXTENDED_CELL_SERIALIZATION_ENABLED_DEFULT)
    ) {
      serializations.add(ExtendedCellSerialization.class.getName());
    }
    serializations.add(CellSerialization.class.getName());

    conf.setStrings("io.serializations", serializations.toArray(new String[0]));
  }

  /**
   * Configures an incremental-load job for a single table from its {@link TableDescriptor}
   * alone. Sets the output key/value/format classes and serializes the per-family attributes
   * (compression, block size, bloom type/param, data block encoding) of the given table into
   * the job configuration.
   * @param job the job whose configuration is updated
   * @param tableDescriptor descriptor of the table the HFiles are generated for
   * @throws IOException propagated from job setup (dependency jars / credentials)
   */
  public static void configureIncrementalLoadMap(Job job, TableDescriptor tableDescriptor)
    throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(MapReduceExtendedCell.class);
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Reuse the multi-table attribute serializer with a one-element list.
    ArrayList<TableDescriptor> singleTableDescriptor = new ArrayList<>(1);
    singleTableDescriptor.add(tableDescriptor);

    conf.set(OUTPUT_TABLE_NAME_CONF_KEY, tableDescriptor.getTableName().getNameAsString());
    // Set compression algorithms based on column families
    conf.set(COMPRESSION_FAMILIES_CONF_KEY,
      serializeColumnFamilyAttribute(compressionDetails, singleTableDescriptor));
    conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY,
      serializeColumnFamilyAttribute(blockSizeDetails, singleTableDescriptor));
    conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY,
      serializeColumnFamilyAttribute(bloomTypeDetails, singleTableDescriptor));
    conf.set(BLOOM_PARAM_FAMILIES_CONF_KEY,
      serializeColumnFamilyAttribute(bloomParamDetails, singleTableDescriptor));
    conf.set(DATABLOCK_ENCODING_FAMILIES_CONF_KEY,
      serializeColumnFamilyAttribute(dataBlockEncodingDetails, singleTableDescriptor));

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + tableDescriptor.getTableName() + " output configured.");
  }

  /**
   * Configure HBase cluster key for remote cluster to load region location for locality-sensitive
   * if it's enabled. It's not necessary to call this method explicitly when the cluster key for
   * HBase cluster to be used to load region location is configured in the job configuration. Call
   * this method when another HBase cluster key is configured in the job configuration. For example,
   * you should call when you load data from HBase cluster A using {@link TableInputFormat} and
   * generate hfiles for HBase cluster B. Otherwise, HFileOutputFormat2 fetches location from
   * cluster A and locality-sensitive won't work correctly.
   * {@link #configureIncrementalLoad(Job, Table, RegionLocator)} calls this method using
   * {@link Table#getConfiguration} as clusterConf. See HBASE-25608.
806 * @param job which has configuration to be updated 807 * @param clusterConf which contains cluster key of the HBase cluster to be locality-sensitive 808 * @see #configureIncrementalLoad(Job, Table, RegionLocator) 809 * @see #LOCALITY_SENSITIVE_CONF_KEY 810 * @see #REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY 811 * @see #REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY 812 * @see #REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY 813 */ 814 public static void configureRemoteCluster(Job job, Configuration clusterConf) { 815 Configuration conf = job.getConfiguration(); 816 817 if (!conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) { 818 return; 819 } 820 821 final String quorum = clusterConf.get(HConstants.ZOOKEEPER_QUORUM); 822 final int clientPort = clusterConf.getInt(HConstants.ZOOKEEPER_CLIENT_PORT, 823 HConstants.DEFAULT_ZOOKEEPER_CLIENT_PORT); 824 final String parent = 825 clusterConf.get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); 826 827 conf.set(REMOTE_CLUSTER_ZOOKEEPER_QUORUM_CONF_KEY, quorum); 828 conf.setInt(REMOTE_CLUSTER_ZOOKEEPER_CLIENT_PORT_CONF_KEY, clientPort); 829 conf.set(REMOTE_CLUSTER_ZOOKEEPER_ZNODE_PARENT_CONF_KEY, parent); 830 831 LOG.info("ZK configs for remote cluster of bulkload is configured: " + quorum + ":" + clientPort 832 + "/" + parent); 833 } 834 835 /** 836 * Runs inside the task to deserialize column family to compression algorithm map from the 837 * configuration. 
838 * @param conf to read the serialized values from 839 * @return a map from column family to the configured compression algorithm 840 */ 841 @InterfaceAudience.Private 842 static Map<byte[], Algorithm> createFamilyCompressionMap(Configuration conf) { 843 Map<byte[], String> stringMap = createFamilyConfValueMap(conf, COMPRESSION_FAMILIES_CONF_KEY); 844 Map<byte[], Algorithm> compressionMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); 845 for (Map.Entry<byte[], String> e : stringMap.entrySet()) { 846 Algorithm algorithm = HFileWriterImpl.compressionByName(e.getValue()); 847 compressionMap.put(e.getKey(), algorithm); 848 } 849 return compressionMap; 850 } 851 852 /** 853 * Runs inside the task to deserialize column family to bloom filter type map from the 854 * configuration. 855 * @param conf to read the serialized values from 856 * @return a map from column family to the the configured bloom filter type 857 */ 858 @InterfaceAudience.Private 859 static Map<byte[], BloomType> createFamilyBloomTypeMap(Configuration conf) { 860 Map<byte[], String> stringMap = createFamilyConfValueMap(conf, BLOOM_TYPE_FAMILIES_CONF_KEY); 861 Map<byte[], BloomType> bloomTypeMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); 862 for (Map.Entry<byte[], String> e : stringMap.entrySet()) { 863 BloomType bloomType = BloomType.valueOf(e.getValue()); 864 bloomTypeMap.put(e.getKey(), bloomType); 865 } 866 return bloomTypeMap; 867 } 868 869 /** 870 * Runs inside the task to deserialize column family to bloom filter param map from the 871 * configuration. 872 * @param conf to read the serialized values from 873 * @return a map from column family to the the configured bloom filter param 874 */ 875 @InterfaceAudience.Private 876 static Map<byte[], String> createFamilyBloomParamMap(Configuration conf) { 877 return createFamilyConfValueMap(conf, BLOOM_PARAM_FAMILIES_CONF_KEY); 878 } 879 880 /** 881 * Runs inside the task to deserialize column family to block size map from the configuration. 
882 * @param conf to read the serialized values from 883 * @return a map from column family to the configured block size 884 */ 885 @InterfaceAudience.Private 886 static Map<byte[], Integer> createFamilyBlockSizeMap(Configuration conf) { 887 Map<byte[], String> stringMap = createFamilyConfValueMap(conf, BLOCK_SIZE_FAMILIES_CONF_KEY); 888 Map<byte[], Integer> blockSizeMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); 889 for (Map.Entry<byte[], String> e : stringMap.entrySet()) { 890 Integer blockSize = Integer.parseInt(e.getValue()); 891 blockSizeMap.put(e.getKey(), blockSize); 892 } 893 return blockSizeMap; 894 } 895 896 /** 897 * Runs inside the task to deserialize column family to data block encoding type map from the 898 * configuration. 899 * @param conf to read the serialized values from 900 * @return a map from column family to HFileDataBlockEncoder for the configured data block type 901 * for the family 902 */ 903 @InterfaceAudience.Private 904 static Map<byte[], DataBlockEncoding> createFamilyDataBlockEncodingMap(Configuration conf) { 905 Map<byte[], String> stringMap = 906 createFamilyConfValueMap(conf, DATABLOCK_ENCODING_FAMILIES_CONF_KEY); 907 Map<byte[], DataBlockEncoding> encoderMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); 908 for (Map.Entry<byte[], String> e : stringMap.entrySet()) { 909 encoderMap.put(e.getKey(), DataBlockEncoding.valueOf((e.getValue()))); 910 } 911 return encoderMap; 912 } 913 914 /** 915 * Run inside the task to deserialize column family to given conf value map. 
916 * @param conf to read the serialized values from 917 * @param confName conf key to read from the configuration 918 * @return a map of column family to the given configuration value 919 */ 920 private static Map<byte[], String> createFamilyConfValueMap(Configuration conf, String confName) { 921 Map<byte[], String> confValMap = new TreeMap<>(Bytes.BYTES_COMPARATOR); 922 String confVal = conf.get(confName, ""); 923 for (String familyConf : confVal.split("&")) { 924 String[] familySplit = familyConf.split("="); 925 if (familySplit.length != 2) { 926 continue; 927 } 928 try { 929 confValMap.put(Bytes.toBytes(URLDecoder.decode(familySplit[0], "UTF-8")), 930 URLDecoder.decode(familySplit[1], "UTF-8")); 931 } catch (UnsupportedEncodingException e) { 932 // will not happen with UTF-8 encoding 933 throw new AssertionError(e); 934 } 935 } 936 return confValMap; 937 } 938 939 /** 940 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against 941 * <code>splitPoints</code>. Cleans up the partitions file after job exists. 
942 */ 943 static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints, 944 boolean writeMultipleTables) throws IOException { 945 Configuration conf = job.getConfiguration(); 946 // create the partitions file 947 FileSystem fs = FileSystem.get(conf); 948 String hbaseTmpFsDir = 949 conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY, HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY); 950 Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID()); 951 fs.makeQualified(partitionsPath); 952 writePartitions(conf, partitionsPath, splitPoints, writeMultipleTables); 953 fs.deleteOnExit(partitionsPath); 954 955 // configure job to use it 956 job.setPartitionerClass(TotalOrderPartitioner.class); 957 TotalOrderPartitioner.setPartitionFile(conf, partitionsPath); 958 } 959 960 @edu.umd.cs.findbugs.annotations.SuppressWarnings( 961 value = "RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE") 962 @InterfaceAudience.Private 963 static String serializeColumnFamilyAttribute(Function<ColumnFamilyDescriptor, String> fn, 964 List<TableDescriptor> allTables) throws UnsupportedEncodingException { 965 StringBuilder attributeValue = new StringBuilder(); 966 int i = 0; 967 for (TableDescriptor tableDescriptor : allTables) { 968 if (tableDescriptor == null) { 969 // could happen with mock table instance 970 // CODEREVIEW: Can I set an empty string in conf if mock table instance? 
971 return ""; 972 } 973 for (ColumnFamilyDescriptor familyDescriptor : tableDescriptor.getColumnFamilies()) { 974 if (i++ > 0) { 975 attributeValue.append('&'); 976 } 977 attributeValue.append(URLEncoder 978 .encode(Bytes.toString(combineTableNameSuffix(tableDescriptor.getTableName().getName(), 979 familyDescriptor.getName())), "UTF-8")); 980 attributeValue.append('='); 981 attributeValue.append(URLEncoder.encode(fn.apply(familyDescriptor), "UTF-8")); 982 } 983 } 984 // Get rid of the last ampersand 985 return attributeValue.toString(); 986 } 987 988 /** 989 * Serialize column family to compression algorithm map to configuration. Invoked while 990 * configuring the MR job for incremental load. 991 */ 992 @InterfaceAudience.Private 993 static Function<ColumnFamilyDescriptor, String> compressionDetails = 994 familyDescriptor -> familyDescriptor.getCompressionType().getName(); 995 996 /** 997 * Serialize column family to block size map to configuration. Invoked while configuring the MR 998 * job for incremental load. 999 */ 1000 @InterfaceAudience.Private 1001 static Function<ColumnFamilyDescriptor, String> blockSizeDetails = 1002 familyDescriptor -> String.valueOf(familyDescriptor.getBlocksize()); 1003 1004 /** 1005 * Serialize column family to bloom type map to configuration. Invoked while configuring the MR 1006 * job for incremental load. 1007 */ 1008 @InterfaceAudience.Private 1009 static Function<ColumnFamilyDescriptor, String> bloomTypeDetails = familyDescriptor -> { 1010 String bloomType = familyDescriptor.getBloomFilterType().toString(); 1011 if (bloomType == null) { 1012 bloomType = ColumnFamilyDescriptorBuilder.DEFAULT_BLOOMFILTER.name(); 1013 } 1014 return bloomType; 1015 }; 1016 1017 /** 1018 * Serialize column family to bloom param map to configuration. Invoked while configuring the MR 1019 * job for incremental load. 
1020 */ 1021 @InterfaceAudience.Private 1022 static Function<ColumnFamilyDescriptor, String> bloomParamDetails = familyDescriptor -> { 1023 BloomType bloomType = familyDescriptor.getBloomFilterType(); 1024 String bloomParam = ""; 1025 if (bloomType == BloomType.ROWPREFIX_FIXED_LENGTH) { 1026 bloomParam = familyDescriptor.getConfigurationValue(BloomFilterUtil.PREFIX_LENGTH_KEY); 1027 } 1028 return bloomParam; 1029 }; 1030 1031 /** 1032 * Serialize column family to data block encoding map to configuration. Invoked while configuring 1033 * the MR job for incremental load. 1034 */ 1035 @InterfaceAudience.Private 1036 static Function<ColumnFamilyDescriptor, String> dataBlockEncodingDetails = familyDescriptor -> { 1037 DataBlockEncoding encoding = familyDescriptor.getDataBlockEncoding(); 1038 if (encoding == null) { 1039 encoding = DataBlockEncoding.NONE; 1040 } 1041 return encoding.toString(); 1042 }; 1043 1044}