@@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path
21 21  import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
22 22  import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
23 23  import org.apache.spark.sql.execution.SortExec
24     -import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, InMemoryFileIndex, LogicalRelation}
   24  +import org.apache.spark.sql.execution.datasources.{BucketingUtils, FileIndex, HadoopFsRelation, InMemoryFileIndex, LogicalRelation}
25 25  import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
26 26
27 27  import com.microsoft.hyperspace.{Hyperspace, Implicits, SampleData, TestConfig, TestUtils}
@@ -30,6 +30,7 @@ import com.microsoft.hyperspace.index.IndexLogEntryTags._
30 30  import com.microsoft.hyperspace.index.execution.BucketUnionStrategy
31 31  import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation
32 32  import com.microsoft.hyperspace.index.rules.{FilterIndexRule, JoinIndexRule}
   33  +import com.microsoft.hyperspace.util.LogicalPlanUtils.BucketSelector
33 34  import com.microsoft.hyperspace.util.PathUtils
34 35
35 36  class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
@@ -112,7 +113,11 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
112 113
113 114        def query(): DataFrame = df.filter("c3 == 'facebook'").select("c3", "c1")
114 115
115      -      verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName))
    116  +      verifyIndexUsage(
    117  +        query,
    118  +        getIndexFilesPathWithBucketSelector(
    119  +          query().queryExecution.optimizedPlan,
    120  +          indexConfig.indexName))
116 121      }
117 122    }
118 123  }
@@ -128,7 +133,11 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
128 133      def query(): DataFrame = df.filter("C3 == 'facebook'").select("C3", "c1")
129 134
130 135      // Verify if case-insensitive index works with case-insensitive query.
131      -    verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName))
    136  +    verifyIndexUsage(
    137  +      query,
    138  +      getIndexFilesPathWithBucketSelector(
    139  +        query().queryExecution.optimizedPlan,
    140  +        indexConfig.indexName))
132 141    }
133 142
134 143    test("E2E test for case sensitive filter query where changing conf changes behavior.") {
@@ -145,7 +154,11 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
145 154      }
146 155
147 156      withSQLConf("spark.sql.caseSensitive" -> "false") {
148      -      verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName))
    157  +      verifyIndexUsage(
    158  +        query,
    159  +        getIndexFilesPathWithBucketSelector(
    160  +          query().queryExecution.optimizedPlan,
    161  +          indexConfig.indexName))
149 162      }
150 163    }
151 164
@@ -165,9 +178,12 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
165 178        def query(): DataFrame = spark.sql("SELECT * from t where c4 = 1")
166 179
167 180        // Verify no Project node is present in the query plan, as a result of using SELECT *
168      -      assert(query().queryExecution.optimizedPlan.collect { case p: Project => p }.isEmpty)
    181  +      val queryPlan = query().queryExecution.optimizedPlan
    182  +      assert(queryPlan.collect { case p: Project => p }.isEmpty)
169 183
170      -      verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName))
    184  +      verifyIndexUsage(
    185  +        query,
    186  +        getIndexFilesPathWithBucketSelector(queryPlan, indexConfig.indexName))
171 187      }
172 188    }
173 189  }
@@ -388,10 +404,11 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
388 404
389 405      spark.enableHyperspace()
390 406      val dfWithHyperspaceEnabled = query(df)
    407  +    val planWithHyperspaceEnabled = dfWithHyperspaceEnabled.queryExecution.optimizedPlan
391 408
392 409      verifyQueryPlanHasExpectedRootPaths(
393      -      dfWithHyperspaceEnabled.queryExecution.optimizedPlan,
394      -      getIndexFilesPath(indexConfig.indexName))
    410  +      planWithHyperspaceEnabled,
    411  +      getIndexFilesPathWithBucketSelector(planWithHyperspaceEnabled, indexConfig.indexName))
395 412
396 413      assert(schemaWithHyperspaceDisabled.equals(dfWithHyperspaceEnabled.schema))
397 414      assert(sortedRowsWithHyperspaceDisabled.sameElements(getSortedRows(dfWithHyperspaceEnabled)))
@@ -503,7 +520,11 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
503 520      def query(): DataFrame =
504 521        spark.read.parquet(testPath).filter("c3 == 'facebook'").select("c3", "c1")
505 522
506      -    verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName))
    523  +    verifyIndexUsage(
    524  +      query,
    525  +      getIndexFilesPathWithBucketSelector(
    526  +        query().queryExecution.optimizedPlan,
    527  +        indexConfig.indexName))
507 528
508 529      // Delete some source data file.
509 530      TestUtils.deleteFiles(testPath, "*parquet", 1)
@@ -518,7 +539,12 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
518 539        hyperspace.refreshIndex(indexConfig.indexName, REFRESH_MODE_INCREMENTAL)
519 540
520 541        // Verify index usage on latest version of index (v=1) after refresh.
521      -      verifyIndexUsage(query, getIndexFilesPath(indexConfig.indexName, Seq(1)))
    542  +      verifyIndexUsage(
    543  +        query,
    544  +        getIndexFilesPathWithBucketSelector(
    545  +          query().queryExecution.optimizedPlan,
    546  +          indexConfig.indexName,
    547  +          Seq(1)))
522 548      }
523 549    }
524 550  }
@@ -951,6 +977,47 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
951 977      }
952 978    }
953 979
     980  +  test("Verify excluding index data file path using bucket pruning.") {
     981  +    withTempPathAsString { testPath =>
     982  +      // Setup. Create data.
     983  +      val indexConfig = IndexConfig("index", Seq("c3"), Seq("c4"))
     984  +      import spark.implicits._
     985  +      SampleData.testData
     986  +        .toDF("c1", "c2", "c3", "c4", "c5")
     987  +        .limit(10)
     988  +        .write
     989  +        .json(testPath)
     990  +      val df = spark.read.json(testPath)
     991  +
     992  +      // Create index.
     993  +      hyperspace.createIndex(df, indexConfig)
     994  +      spark.enableHyperspace()
     995  +
     996  +      def query(): DataFrame =
     997  +        df.filter(df("c3") isin (Seq("facebook", "donde", "miperro"): _*)).select("c4", "c3")
     998  +
     999  +      withIndex("index") {
    1000  +        val index = TestUtils.latestIndexLogEntry(systemPath, indexConfig.indexName)
    1001  +        val plan = query().queryExecution.optimizedPlan
    1002  +        val buckets = BucketSelector(plan, index.bucketSpec)
    1003  +        assert(buckets.isDefined)
    1004  +
    1005  +        val locs = getFsLocation(plan)
    1006  +        assert(locs.size == 1)
    1007  +        assert(buckets.get.cardinality() == 3)
    1008  +
    1009  +        val indexFiles = locs.head.inputFiles
    1010  +        assert(indexFiles.length == buckets.get.cardinality())
    1011  +        assert(indexFiles.length < index.content.files.length)
    1012  +
    1013  +        val indexFilesBitIdSet = indexFiles.map(BucketingUtils.getBucketId(_).get).toSet
    1014  +        indexFilesBitIdSet.forall(buckets.get.get(_))
    1015  +        (1 to index.bucketSpec.numBuckets).forall(n =>
    1016  +          !(buckets.get.get(n) ^ indexFilesBitIdSet.contains(n)))
    1017  +      }
    1018  +    }
    1019  +  }
    1020  +
954 1021    /**
955 1022     * Verify that the query plan has the expected rootPaths.
956 1023     *
@@ -980,6 +1047,19 @@ class E2EHyperspaceRulesTest extends QueryTest with HyperspaceSuite {
980 1047      }.flatten
981 1048    }
982 1049
    1050  +  private def getIndexFilesPathWithBucketSelector(
    1051  +      plan: LogicalPlan,
    1052  +      indexName: String,
    1053  +      versions: Seq[Int] = Seq(0)): Seq[Path] = {
    1054  +    val paths = getIndexFilesPath(indexName, versions)
    1055  +    BucketSelector(plan, TestUtils.latestIndexLogEntry(systemPath, indexName).bucketSpec) match {
    1056  +      case Some(buckets) =>
    1057  +        paths.filter(f => buckets.get(BucketingUtils.getBucketId(f.getName).get))
    1058  +      case None =>
    1059  +        paths
    1060  +    }
    1061  +  }
    1062  +
983 1063    private def getIndexFilesPath(indexName: String, versions: Seq[Int] = Seq(0)): Seq[Path] = {
984 1064      versions.flatMap { v =>
985 1065        Content
0 commit comments