Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-33507 Improve and fix cache behavior in v1 and v2
  3. SPARK-33941

ALTER TABLE .. DROP PARTITION doesn't invalidate the cache

    Export: XML | Word | Printable | JSON

Details

    • Type: Sub-task
    • Status: In Progress
    • Priority: Major
    • Resolution: Unresolved
    • Affects Version/s: 2.4.7, 3.0.1, 3.1.0, 3.2.0
    • Fix Version/s: None
    • Component/s: SQL
    • Labels: None

    Description

      1. Create a partitioned table:

      scala> sql("CREATE TABLE tbl2 (id int, part int) USING parquet PARTITIONED BY (part)")
      res31: org.apache.spark.sql.DataFrame = []
      

      2. Create partitions:

      scala> sql("INSERT INTO tbl2 PARTITION (part=0) SELECT 0")
      res32: org.apache.spark.sql.DataFrame = []
      
      scala> sql("INSERT INTO tbl2 PARTITION (part=1) SELECT 1")
      res33: org.apache.spark.sql.DataFrame = []
      

      3. Create dataframes from the table:

      scala> val df1 = spark.table("tbl2")
      df1: org.apache.spark.sql.DataFrame = [id: int, part: int]
      
      scala> val df2 = spark.table("tbl2")
      df2: org.apache.spark.sql.DataFrame = [id: int, part: int]
      

      4. Cache df2, then materialize both dataframes to populate the cache:

      scala> df2.cache
      res34: df2.type = [id: int, part: int]
      
      scala> df1.show(false)
      +---+----+
      |id |part|
      +---+----+
      |0  |0   |
      |1  |1   |
      +---+----+
      
      
      scala> df2.show(false)
      +---+----+
      |id |part|
      +---+----+
      |0  |0   |
      |1  |1   |
      +---+----+
      

      5. Drop a partition:

      scala> sql("ALTER TABLE tbl2 DROP PARTITION(part=0)")
      res37: org.apache.spark.sql.DataFrame = []
      
      scala> df1.show(false)
      +---+----+
      |id |part|
      +---+----+
      |0  |0   |
      |1  |1   |
      +---+----+
      
      
      scala> df2.show(false)
      +---+----+
      |id |part|
      +---+----+
      |0  |0   |
      |1  |1   |
      +---+----+
      

      Note that after the DROP PARTITION, both df1 and df2 incorrectly still show the dropped partition (part=0), because the cached data is not invalidated. For comparison, the same scenario without caching returns the correct result:

      scala> sql("CREATE TABLE tbl3 (id int, part int) USING parquet PARTITIONED BY (part)")
      res40: org.apache.spark.sql.DataFrame = []
      
      scala> sql("INSERT INTO tbl3 PARTITION (part=0) SELECT 0")
      res41: org.apache.spark.sql.DataFrame = []
      
      scala> sql("INSERT INTO tbl3 PARTITION (part=1) SELECT 1")
      res42: org.apache.spark.sql.DataFrame = []
      
      scala> val df3 = spark.table("tbl3")
      df3: org.apache.spark.sql.DataFrame = [id: int, part: int]
      
      scala> df3.show(false)
      +---+----+
      |id |part|
      +---+----+
      |0  |0   |
      |1  |1   |
      +---+----+
      
      
      scala> sql("ALTER TABLE tbl3 DROP PARTITION(part=0)")
      res44: org.apache.spark.sql.DataFrame = []
      
      scala> df3.show(false)
      +---+----+
      |id |part|
      +---+----+
      |1  |1   |
      +---+----+
      

      Attachments

        Issue Links

          Activity

            People

              Assignee: Unassigned
              Reporter: maxgekk (Max Gekk)
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated: