-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-28755: Statistics Management Task #6438
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 6 commits
e1841cf
94be1ff
1193c11
0d05096
ddcf54b
73d7606
d2465ee
65ad54f
86f2af4
85868ce
272005c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1291,6 +1291,22 @@ public enum ConfVars { | |
| "metastore.partition.management.table.pattern", "*", | ||
| "Automatic partition management will look for tables using the specified table pattern"), | ||
|
|
||
| COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.column.statistics.management.task.frequency", | ||
| "metastore.column.statistics.management.task.frequency", | ||
| 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics \n" + | ||
| "management for tables. Statistics management include 2 configs. \n" + | ||
| "One is 'metastore.column.statistics.auto.deletion', and the other is 'metastore.column.statistics.retention.period'. \n" + | ||
| "When 'metastore.column.statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n" + | ||
| "column statistics are over the retention period, and then delete the column stats. \n"), | ||
|
|
||
| COLUMN_STATISTICS_RETENTION_PERIOD("metastore.column.statistics.retention.period", | ||
| "metastore.column.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + | ||
| "that we want to keep the stats for each table, which means if the stats are older than this period\n" + | ||
| "of time, the stats will be automatically deleted. \n"), | ||
|
|
||
| COLUMN_STATISTICS_AUTO_DELETION("metastore.column.statistics.auto.deletion", "metastore.column.statistics.auto.deletion", false, | ||
| "Whether table/partition column statistics will be auto deleted after retention period"), | ||
|
soumyakanti3578 marked this conversation as resolved.
Comment on lines
+1296
to
+1308
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The description is very clear, no need to change. |
||
|
|
||
| METASTORE_METADATA_TRANSFORMER_CLASS("metastore.metadata.transformer.class", "metastore.metadata.transformer.class", | ||
| "org.apache.hadoop.hive.metastore.MetastoreDefaultTransformer", | ||
| "Fully qualified class name for the metastore metadata transformer class \n" | ||
|
|
@@ -1526,7 +1542,8 @@ public enum ConfVars { | |
| ACID_METRICS_TASK_CLASS + "," + ACID_METRICS_LOGGER_CLASS + "," + | ||
| "org.apache.hadoop.hive.metastore.HiveProtoEventsCleanerTask" + "," | ||
| + "org.apache.hadoop.hive.metastore.ScheduledQueryExecutionsMaintTask" + "," | ||
| + "org.apache.hadoop.hive.metastore.ReplicationMetricsMaintTask", | ||
| + "org.apache.hadoop.hive.metastore.ReplicationMetricsMaintTask" + "," | ||
| + "org.apache.hadoop.hive.metastore.StatisticsManagementTask", | ||
|
DanielZhu58 marked this conversation as resolved.
|
||
| "Comma separated list of tasks that will be started in separate threads. These will " + | ||
| "always be started, regardless of whether the metastore is running in embedded mode " + | ||
| "or in server mode. They must implement " + METASTORE_TASK_THREAD_CLASS), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,204 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.hive.metastore; | ||
|
|
||
| import java.util.List; | ||
| import java.util.concurrent.TimeUnit; | ||
|
DanielZhu58 marked this conversation as resolved.
|
||
| import java.util.concurrent.locks.Lock; | ||
| import java.util.concurrent.locks.ReentrantLock; | ||
|
|
||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; | ||
| import org.apache.hadoop.hive.metastore.conf.MetastoreConf; | ||
| import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; | ||
| import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import javax.jdo.PersistenceManager; | ||
| import javax.jdo.Query; | ||
|
|
||
| /** | ||
| * Statistics management task responsible for periodic auto-deletion of table and partition column | ||
| * statistics based on a configured retention interval. | ||
| * | ||
| * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans | ||
| * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp | ||
| * is older than {@code metastore.statistics.retention.period}, and deletes them. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| * Individual tables may opt out by setting the table property | ||
| * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. | ||
| */ | ||
| public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { | ||
|
|
||
| private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); | ||
|
|
||
| /** | ||
| * Table property key that, when present on a table, excludes it from automatic statistics | ||
| * deletion regardless of the global retention setting. | ||
| */ | ||
| public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = | ||
| "statistics.auto.deletion.exclude"; | ||
|
|
||
| private static final Lock LOCK = new ReentrantLock(); | ||
|
|
||
| private Configuration conf; | ||
|
|
||
| @Override | ||
| public long runFrequency(TimeUnit unit) { | ||
| return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); | ||
| } | ||
|
|
||
| @Override | ||
| public void setConf(Configuration configuration) { | ||
| this.conf = new Configuration(configuration); | ||
| super.setConf(configuration); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| } | ||
|
|
||
| @Override | ||
| public Configuration getConf() { | ||
| return conf; | ||
| } | ||
|
Comment on lines
+140
to
+153
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged. I added a filter which only collect the hive table and hive partitions. We do not want to touch any impala/spark tables and partitions. |
||
|
|
||
| @Override | ||
| public void run() { | ||
| LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); | ||
| long retentionMillis = | ||
| MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); | ||
| if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. perform the check in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| LOG.info("Statistics auto deletion is set to off currently."); | ||
| return; | ||
| } | ||
| if (!LOCK.tryLock()) { | ||
| return; | ||
| } | ||
| try { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: unnecessary nested
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The inner try ensures the Phase 1 read transaction is always either committed or rolled back correctly. The outer try handles any unexpected exceptions from the entire run and guarantees the lock is always released. So it's ok to not remove them. |
||
| long now = System.currentTimeMillis(); | ||
| long lastAnalyzedThreshold = (now - retentionMillis) / 1000; | ||
| PersistenceManager pm = getPersistenceManager(); | ||
| boolean committed = false; | ||
| openTransaction(); | ||
|
DanielZhu58 marked this conversation as resolved.
Outdated
|
||
| try { | ||
| try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why create a HiveMetaStoreClient if we can use the method in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold); | ||
| deleteExpiredPartitionColStats(pm, msc, lastAnalyzedThreshold); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| } | ||
| committed = commitTransaction(); | ||
| } finally { | ||
| if (!committed) { | ||
| rollbackTransaction(); | ||
| } | ||
| } | ||
| } catch (Exception e) { | ||
| LOG.error("Error during statistics auto deletion", e); | ||
| } finally { | ||
| LOCK.unlock(); | ||
| } | ||
| } | ||
|
Comment on lines
+215
to
+229
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same issue as the previous one. Resolved. |
||
|
|
||
| /** | ||
| * Deletes expired table-level column statistics from {@code TAB_COL_STATS}. | ||
| * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. | ||
| * | ||
| * @param pm the JDO persistence manager to use for the query | ||
| * @param msc the metastore client used to issue delete requests | ||
| * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired | ||
| * @throws Exception if the JDO query or the delete request fails | ||
| */ | ||
| private void deleteExpiredTableColStats(PersistenceManager pm, IMetaStoreClient msc, | ||
| long lastAnalyzedThreshold) throws Exception { | ||
| Query tblQuery = null; | ||
| try { | ||
| tblQuery = pm.newQuery(MTableColumnStatistics.class); | ||
| tblQuery.setFilter("lastAnalyzed < threshold"); | ||
| tblQuery.declareParameters("long threshold"); | ||
| // partitionName does not exist on MTableColumnStatistics; omitted here | ||
| tblQuery.setResult( | ||
| "table.database.name, " | ||
| + "table.tableName, " | ||
| + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); | ||
| @SuppressWarnings("unchecked") | ||
| List<Object[]> tblRows = (List<Object[]>) tblQuery.execute(lastAnalyzedThreshold); | ||
| for (Object[] row : tblRows) { | ||
| String dbName = (String) row[0]; | ||
| String tblName = (String) row[1]; | ||
| String excludeVal = (String) row[2]; | ||
| if (excludeVal != null) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged and changed. |
||
| LOG.info("Skipping auto deletion of table stats for {}.{} due to exclude property.", | ||
| dbName, tblName); | ||
| continue; | ||
| } | ||
| DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); | ||
| request.setEngine("hive"); | ||
| request.setTableLevel(true); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged. Made some changes and no longer using the request. |
||
| msc.deleteColumnStatistics(request); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will it delete all stats even they are not stale?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged. I changed the implementation, we no longer use the request. |
||
| } | ||
| } finally { | ||
| if (tblQuery != null) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Query is already wrapped in try-with-resources, so close() is called automatically when the block exits. In DataNucleus's JDO implementation, close() internally delegates to closeAll(), so the behavior is equivalent. |
||
| tblQuery.closeAll(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Deletes expired partition-level column statistics from {@code PART_COL_STATS}. | ||
| * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. | ||
| * | ||
| * @param pm the JDO persistence manager to use for the query | ||
| * @param msc the metastore client used to issue delete requests | ||
| * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired | ||
| * @throws Exception if the JDO query or the delete request fails | ||
| */ | ||
| private void deleteExpiredPartitionColStats(PersistenceManager pm, IMetaStoreClient msc, | ||
| long lastAnalyzedThreshold) throws Exception { | ||
| Query partQuery = null; | ||
| try { | ||
| partQuery = pm.newQuery(MPartitionColumnStatistics.class); | ||
| partQuery.setFilter("lastAnalyzed < threshold"); | ||
| partQuery.declareParameters("long threshold"); | ||
| // project via partition navigation to reach partitionName and the table exclude property | ||
| partQuery.setResult( | ||
| "partition.table.database.name, " | ||
| + "partition.table.tableName, " | ||
| + "partition.partitionName, " | ||
| + "partition.table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); | ||
| @SuppressWarnings("unchecked") | ||
| List<Object[]> partRows = (List<Object[]>) partQuery.execute(lastAnalyzedThreshold); | ||
| for (Object[] row : partRows) { | ||
| String dbName = (String) row[0]; | ||
| String tblName = (String) row[1]; | ||
| String partName = (String) row[2]; | ||
| String excludeVal = (String) row[3]; | ||
| if (excludeVal != null) { | ||
| LOG.info("Skipping auto deletion of partition stats for {}.{} due to exclude property.", | ||
| dbName, tblName); | ||
| continue; | ||
| } | ||
| DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); | ||
| request.setEngine("hive"); | ||
| request.setTableLevel(false); | ||
| request.addToPart_names(partName); | ||
| msc.deleteColumnStatistics(request); | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged. Changed the implementation to delete multiple columns stats at the same time. |
||
| } finally { | ||
| if (partQuery != null) { | ||
| partQuery.closeAll(); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing newline.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Acknowledged. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
since we can use
metastore.column.statistics.management.task.frequencyto turn on/off the management task, so I think we don't need extraCOLUMN_STATISTICS_AUTO_DELETIONto configure the taskThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's ok to keep COLUMN_STATISTICS_AUTO_DELETION. The two configs have distinct responsibilities: the boolean controls whether the feature is active at all, while the frequency controls how often it runs. Conflating them makes the configuration less readable. It can gives users a simple toggle to temporarily disable deletion without having to remember and restore their frequency setting.