Presto から見ると (parquetの)page単位で IO して、HDFS の API を叩いて、HDFS は DSDataInputStream とかで読んで、OSレイヤーからみると sendfile(2) で xfs などのファイルシステムのファイルを読んでということになってるのではないかと推察。
— (@yoheia) 2018年10月9日
1)flame graphでhdfsのioシステムコールの発行元コールスタックを特定
— (@yoheia) 2018年10月14日
Column chunks
Column chunks are composed of pages written back to back. The pages share a common header and readers can skip over pages they are not interested in. The data for the page follows the header and can be compressed and/or encoded. The compression and encoding is specified in the page metadata.
(1) read only required columns in Parquet and build columnar blocks on the fly, saving CPU and memory to transform row-based Parquet records into columnar blocks, and (2) evaluate the predicate using columnar blocks in the Presto engine.
Engineering Data Analytics with Presto and Parquet at Uber
New Hive Parquet Reader
We have added a new Parquet reader implementation. The new reader supports vectorized reads, lazy loading, and predicate push down, all of which make the reader more efficient and typically reduce wall clock time for a query. Although the new reader has been heavily tested, it is an extensive rewrite of the Apache Hive Parquet reader, and may have some latent issues, so it is not enabled by default. If you are using Parquet we suggest you test out the new reader on a per-query basis by setting the <hive-catalog>.parquet_optimized_reader_enabled session property, or you can enable the reader by default by setting the Hive catalog property hive.parquet-optimized-reader.enabled=true. To enable Parquet predicate push down there is a separate session property <hive-catalog>.parquet_predicate_pushdown_enabled and configuration property hive.parquet-predicate-pushdown.enabled=true.
Hadoop Internals for Oracle Developers and DBAs: Strata Conference + Hadoop World 2013 - O'Reilly Conferences, October 28 - 30, 2013, New York, NY
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.parquet.reader; import com.facebook.presto.parquet.RichColumnDescriptor; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.type.Type; import io.airlift.slice.Slice; import; import static com.facebook.presto.spi.type.Chars.isCharType; import static com.facebook.presto.spi.type.Chars.truncateToLengthAndTrimSpaces; import static com.facebook.presto.spi.type.Varchars.isVarcharType; import static com.facebook.presto.spi.type.Varchars.truncateToLength; import static io.airlift.slice.Slices.EMPTY_SLICE; import static io.airlift.slice.Slices.wrappedBuffer; public class BinaryColumnReader extends PrimitiveColumnReader { public BinaryColumnReader(RichColumnDescriptor descriptor) { super(descriptor); } @Override protected void readValue(BlockBuilder blockBuilder, Type type) { if (definitionLevel == columnDescriptor.getMaxDefinitionLevel()) { Binary binary = valuesReader.readBytes(); Slice value; if (binary.length() == 0) { value = EMPTY_SLICE; } else { value = wrappedBuffer(binary.getBytes()); } if (isVarcharType(type)) { value = truncateToLength(value, type); } if (isCharType(type)) { value = truncateToLengthAndTrimSpaces(value, type); } type.writeSlice(blockBuilder, value); } else if (isValueNull()) { blockBuilder.appendNull(); } } @Override protected void skipValue() { if (definitionLevel == columnDescriptor.getMaxDefinitionLevel()) { valuesReader.readBytes(); } } }
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.parquet.reader; import com.facebook.presto.parquet.DataPage; import com.facebook.presto.parquet.DataPageV1; import com.facebook.presto.parquet.DataPageV2; import com.facebook.presto.parquet.DictionaryPage; import com.facebook.presto.parquet.Field; import com.facebook.presto.parquet.ParquetEncoding; import com.facebook.presto.parquet.ParquetTypeUtils; import com.facebook.presto.parquet.RichColumnDescriptor; import com.facebook.presto.parquet.dictionary.Dictionary; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.type.DecimalType; import com.facebook.presto.spi.type.Type; import io.airlift.slice.Slice; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import parquet.bytes.BytesUtils; import parquet.column.ColumnDescriptor; import parquet.column.values.ValuesReader; import parquet.column.values.rle.RunLengthBitPackingHybridDecoder; import; import; import; import java.util.Optional; import java.util.function.Consumer; import static com.facebook.presto.parquet.ParquetTypeUtils.createDecimalType; import static com.facebook.presto.parquet.ValuesType.DEFINITION_LEVEL; import static com.facebook.presto.parquet.ValuesType.REPETITION_LEVEL; import static com.facebook.presto.parquet.ValuesType.VALUES; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static; import static; import static 
java.util.Objects.requireNonNull; public abstract class PrimitiveColumnReader { private static final int EMPTY_LEVEL_VALUE = -1; protected final RichColumnDescriptor columnDescriptor; protected int definitionLevel = EMPTY_LEVEL_VALUE; protected int repetitionLevel = EMPTY_LEVEL_VALUE; protected ValuesReader valuesReader; private int nextBatchSize; private LevelReader repetitionReader; private LevelReader definitionReader; private long totalValueCount; private PageReader pageReader; private Dictionary dictionary; private int currentValueCount; private DataPage page; private int remainingValueCountInPage; private int readOffset; protected abstract void readValue(BlockBuilder blockBuilder, Type type); protected abstract void skipValue(); protected boolean isValueNull() { return ParquetTypeUtils.isValueNull(columnDescriptor.isRequired(), definitionLevel, columnDescriptor.getMaxDefinitionLevel()); } public static PrimitiveColumnReader createReader(RichColumnDescriptor descriptor) { switch (descriptor.getType()) { case BOOLEAN: return new BooleanColumnReader(descriptor); case INT32: return createDecimalColumnReader(descriptor).orElse(new IntColumnReader(descriptor)); case INT64: return createDecimalColumnReader(descriptor).orElse(new LongColumnReader(descriptor)); case INT96: return new TimestampColumnReader(descriptor); case FLOAT: return new FloatColumnReader(descriptor); case DOUBLE: return new DoubleColumnReader(descriptor); case BINARY: return createDecimalColumnReader(descriptor).orElse(new BinaryColumnReader(descriptor)); case FIXED_LEN_BYTE_ARRAY: return createDecimalColumnReader(descriptor) .orElseThrow(() -> new PrestoException(NOT_SUPPORTED, " type FIXED_LEN_BYTE_ARRAY supported as DECIMAL; got " + descriptor.getPrimitiveType().getOriginalType())); default: throw new PrestoException(NOT_SUPPORTED, "Unsupported parquet type: " + descriptor.getType()); } } private static Optional<PrimitiveColumnReader> createDecimalColumnReader(RichColumnDescriptor descriptor) { 
Optional<Type> type = createDecimalType(descriptor); if (type.isPresent()) { DecimalType decimalType = (DecimalType) type.get(); return Optional.of(DecimalColumnReaderFactory.createReader(descriptor, decimalType.getPrecision(), decimalType.getScale())); } return Optional.empty(); } public PrimitiveColumnReader(RichColumnDescriptor columnDescriptor) { this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor"); pageReader = null; } public PageReader getPageReader() { return pageReader; } public void setPageReader(PageReader pageReader) { this.pageReader = requireNonNull(pageReader, "pageReader"); DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); if (dictionaryPage != null) { try { dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage); } catch (IOException e) { throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e); } } else { dictionary = null; } checkArgument(pageReader.getTotalValueCount() > 0, "page is empty"); totalValueCount = pageReader.getTotalValueCount(); } public void prepareNextRead(int batchSize) { readOffset = readOffset + nextBatchSize; nextBatchSize = batchSize; } public ColumnDescriptor getDescriptor() { return columnDescriptor; } public ColumnChunk readPrimitive(Field field) throws IOException { IntList definitionLevels = new IntArrayList(); IntList repetitionLevels = new IntArrayList(); seek(); BlockBuilder blockBuilder = field.getType().createBlockBuilder(null, nextBatchSize); int valueCount = 0; while (valueCount < nextBatchSize) { if (page == null) { readNextPage(); } int valuesToRead = Math.min(remainingValueCountInPage, nextBatchSize - valueCount); readValues(blockBuilder, valuesToRead, field.getType(), definitionLevels, repetitionLevels); valueCount += valuesToRead; } checkArgument(valueCount == nextBatchSize, "valueCount %s not equals to batchSize %s", valueCount, nextBatchSize); readOffset = 0; nextBatchSize = 0; return new 
ColumnChunk(, definitionLevels.toIntArray(), repetitionLevels.toIntArray()); } private void readValues(BlockBuilder blockBuilder, int valuesToRead, Type type, IntList definitionLevels, IntList repetitionLevels) { processValues(valuesToRead, ignored -> { readValue(blockBuilder, type); definitionLevels.add(definitionLevel); repetitionLevels.add(repetitionLevel); }); } private void skipValues(int valuesToRead) { processValues(valuesToRead, ignored -> skipValue()); } private void processValues(int valuesToRead, Consumer<Void> valueConsumer) { if (definitionLevel == EMPTY_LEVEL_VALUE && repetitionLevel == EMPTY_LEVEL_VALUE) { definitionLevel = definitionReader.readLevel(); repetitionLevel = repetitionReader.readLevel(); } int valueCount = 0; for (int i = 0; i < valuesToRead; i++) { do { valueConsumer.accept(null); valueCount++; if (valueCount == remainingValueCountInPage) { updateValueCounts(valueCount); if (!readNextPage()) { return; } valueCount = 0; } repetitionLevel = repetitionReader.readLevel(); definitionLevel = definitionReader.readLevel(); } while (repetitionLevel != 0); } updateValueCounts(valueCount); } private void seek() { checkArgument(currentValueCount <= totalValueCount, "Already read all values in column chunk"); if (readOffset == 0) { return; } int valuePosition = 0; while (valuePosition < readOffset) { if (page == null) { readNextPage(); } int offset = Math.min(remainingValueCountInPage, readOffset - valuePosition); skipValues(offset); valuePosition = valuePosition + offset; } checkArgument(valuePosition == readOffset, "valuePosition %s must be equal to readOffset %s", valuePosition, readOffset); } private boolean readNextPage() { verify(page == null, "readNextPage has to be called when page is null"); page = pageReader.readPage(); if (page == null) { // we have read all pages return false; } remainingValueCountInPage = page.getValueCount(); if (page instanceof DataPageV1) { valuesReader = readPageV1((DataPageV1) page); } else { valuesReader = 
readPageV2((DataPageV2) page); } return true; } private void updateValueCounts(int valuesRead) { if (valuesRead == remainingValueCountInPage) { page = null; valuesReader = null; } remainingValueCountInPage -= valuesRead; currentValueCount += valuesRead; } private ValuesReader readPageV1(DataPageV1 page) { ValuesReader rlReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL); ValuesReader dlReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL); repetitionReader = new LevelValuesReader(rlReader); definitionReader = new LevelValuesReader(dlReader); try { byte[] bytes = page.getSlice().getBytes(); rlReader.initFromPage(page.getValueCount(), bytes, 0); int offset = rlReader.getNextOffset(); dlReader.initFromPage(page.getValueCount(), bytes, offset); offset = dlReader.getNextOffset(); return initDataReader(page.getValueEncoding(), bytes, offset, page.getValueCount()); } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e); } } private ValuesReader readPageV2(DataPageV2 page) { repetitionReader = buildLevelRLEReader(columnDescriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); definitionReader = buildLevelRLEReader(columnDescriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); return initDataReader(page.getDataEncoding(), page.getSlice().getBytes(), 0, page.getValueCount()); } private LevelReader buildLevelRLEReader(int maxLevel, Slice slice) { if (maxLevel == 0) { return new LevelNullReader(); } return new LevelRLEReader(new RunLengthBitPackingHybridDecoder(BytesUtils.getWidthFromMaxInt(maxLevel), new ByteArrayInputStream(slice.getBytes()))); } private ValuesReader initDataReader(ParquetEncoding dataEncoding, byte[] bytes, int offset, int valueCount) { ValuesReader valuesReader; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException("Dictionary is 
missing for Page"); } valuesReader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary); } else { valuesReader = dataEncoding.getValuesReader(columnDescriptor, VALUES); } try { valuesReader.initFromPage(valueCount, bytes, offset); return valuesReader; } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e); } } }
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.hive.parquet; import com.facebook.presto.hive.HiveColumnHandle; import com.facebook.presto.parquet.Field; import com.facebook.presto.parquet.ParquetCorruptionException; import com.facebook.presto.parquet.reader.ParquetReader; import com.facebook.presto.spi.ConnectorPageSource; import com.facebook.presto.spi.Page; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.block.LazyBlock; import com.facebook.presto.spi.block.LazyBlockLoader; import com.facebook.presto.spi.block.RunLengthEncodedBlock; import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.TypeManager; import; import; import parquet.schema.MessageType; import; import; import java.util.List; import java.util.Optional; import java.util.Properties; import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR; import static com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA; import static com.facebook.presto.hive.HiveErrorCode.HIVE_CURSOR_ERROR; import static com.facebook.presto.hive.parquet.ParquetPageSourceFactory.getParquetType; import static com.facebook.presto.parquet.ParquetTypeUtils.getFieldIndex; import static com.facebook.presto.parquet.ParquetTypeUtils.lookupColumnByName; import static; import static java.util.Objects.requireNonNull; import static; public class ParquetPageSource implements ConnectorPageSource { private 
static final int MAX_VECTOR_LENGTH = 1024; private final ParquetReader parquetReader; private final MessageType fileSchema; // for debugging heap dump private final List<String> columnNames; private final List<Type> types; private final List<Optional<Field>> fields; private final Block[] constantBlocks; private final int[] hiveColumnIndexes; private int batchId; private boolean closed; private long readTimeNanos; private final boolean useParquetColumnNames; public ParquetPageSource( ParquetReader parquetReader, MessageType fileSchema, MessageColumnIO messageColumnIO, TypeManager typeManager, Properties splitSchema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useParquetColumnNames) { requireNonNull(splitSchema, "splitSchema is null"); requireNonNull(columns, "columns is null"); requireNonNull(effectivePredicate, "effectivePredicate is null"); this.parquetReader = requireNonNull(parquetReader, "parquetReader is null"); this.fileSchema = requireNonNull(fileSchema, "fileSchema is null"); this.useParquetColumnNames = useParquetColumnNames; int size = columns.size(); this.constantBlocks = new Block[size]; this.hiveColumnIndexes = new int[size]; ImmutableList.Builder<String> namesBuilder = ImmutableList.builder(); ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder(); ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder(); for (int columnIndex = 0; columnIndex < size; columnIndex++) { HiveColumnHandle column = columns.get(columnIndex); checkState(column.getColumnType() == REGULAR, "column type must be regular"); String name = column.getName(); Type type = typeManager.getType(column.getTypeSignature()); namesBuilder.add(name); typesBuilder.add(type); hiveColumnIndexes[columnIndex] = column.getHiveColumnIndex(); if (getParquetType(column, fileSchema, useParquetColumnNames) == null) { constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH); 
fieldsBuilder.add(Optional.empty()); } else { String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName(); fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName))); } } types =; fields =; columnNames =; } @Override public long getCompletedBytes() { return parquetReader.getDataSource().getReadBytes(); } @Override public long getReadTimeNanos() { return readTimeNanos; } @Override public boolean isFinished() { return closed; } @Override public long getSystemMemoryUsage() { return parquetReader.getSystemMemoryContext().getBytes(); } @Override public Page getNextPage() { try { batchId++; long start = System.nanoTime(); int batchSize = parquetReader.nextBatch(); readTimeNanos += System.nanoTime() - start; if (closed || batchSize <= 0) { close(); return null; } Block[] blocks = new Block[hiveColumnIndexes.length]; for (int fieldId = 0; fieldId < blocks.length; fieldId++) { if (constantBlocks[fieldId] != null) { blocks[fieldId] = constantBlocks[fieldId].getRegion(0, batchSize); } else { Type type = types.get(fieldId); Optional<Field> field = fields.get(fieldId); int fieldIndex; if (useParquetColumnNames) { fieldIndex = getFieldIndex(fileSchema, columnNames.get(fieldId)); } else { fieldIndex = hiveColumnIndexes[fieldId]; } if (fieldIndex != -1 && field.isPresent()) { blocks[fieldId] = new LazyBlock(batchSize, new ParquetBlockLoader(field.get())); } else { blocks[fieldId] = RunLengthEncodedBlock.create(type, null, batchSize); } } } return new Page(batchSize, blocks); } catch (PrestoException e) { closeWithSuppression(e); throw e; } catch (RuntimeException e) { closeWithSuppression(e); throw new PrestoException(HIVE_CURSOR_ERROR, e); } } private void closeWithSuppression(Throwable throwable) { requireNonNull(throwable, "throwable is null"); try { close(); } catch (RuntimeException e) { // Self-suppression not permitted if (e != throwable) { throwable.addSuppressed(e); } } } @Override 
public void close() { if (closed) { return; } closed = true; try { parquetReader.close(); } catch (IOException e) { throw new UncheckedIOException(e); } } private final class ParquetBlockLoader implements LazyBlockLoader<LazyBlock> { private final int expectedBatchId = batchId; private final Field field; private boolean loaded; public ParquetBlockLoader(Field field) { this.field = requireNonNull(field, "field is null"); } @Override public final void load(LazyBlock lazyBlock) { if (loaded) { return; } checkState(batchId == expectedBatchId); try { Block block = parquetReader.readBlock(field); lazyBlock.setBlock(block); } catch (ParquetCorruptionException e) { throw new PrestoException(HIVE_BAD_DATA, e); } catch (IOException e) { throw new PrestoException(HIVE_CURSOR_ERROR, e); } loaded = true; } } }