Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-17396

Thread count keeps increasing when querying an external CSV partitioned table

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Fixed
    • 2.0.0
    • 2.0.1, 2.1.0
    • Spark Core
    • None

    Description

      1. Create an external partitioned table with row format CSV
      2. Add 16 partitions to the table
      3. Run the SQL "select count(*) from test_csv"
      4. The number of ForkJoinThread threads keeps increasing.
      This happens when the number of table partitions is greater than 10.
      5. Test Code

      import org.apache.spark.SparkConf
      import org.apache.spark.SparkContext
      import org.apache.spark.sql.hive.HiveContext
      
      /**
       * Reproduction program for SPARK-17396.
       *
       * Registers an external, partitioned `test_csv` table backed by the OpenCSV
       * SerDe, attaches 16 partitions to it, and then keeps six threads busy
       * issuing `select count(*)` against the table.
       */
      object Bugs {

        /**
         * Sets up the table and its partitions, then starts the query threads.
         *
         * The started threads loop forever, so the program does not terminate on
         * its own.
         *
         * @param args command-line arguments (unused)
         */
        def main(args: Array[String]): Unit = {
          // Root directory of the external table; each partition lives in `index=<n>` below it.
          val location = "file:///g:/home/test/csv"

          // DDL creating the external table, partitioned by an integer `index` column.
          val createTableSql = s"""CREATE   EXTERNAL  TABLE  test_csv
                   (ID string,  SEQ string )
                    PARTITIONED BY(index int)
                    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
                    LOCATION "${location}" 
                """

          // DDL registering partitions index=1 .. index=16 in a single statement.
          val addPartitionsSql = s"""
        ALTER TABLE test_csv ADD 
        PARTITION (index=1)LOCATION '${location}/index=1'
        PARTITION (index=2)LOCATION '${location}/index=2'
        PARTITION (index=3)LOCATION '${location}/index=3'
        PARTITION (index=4)LOCATION '${location}/index=4'
        PARTITION (index=5)LOCATION '${location}/index=5'
        PARTITION (index=6)LOCATION '${location}/index=6'
        PARTITION (index=7)LOCATION '${location}/index=7'
        PARTITION (index=8)LOCATION '${location}/index=8'
        PARTITION (index=9)LOCATION '${location}/index=9'
        PARTITION (index=10)LOCATION '${location}/index=10'
        PARTITION (index=11)LOCATION '${location}/index=11'
        PARTITION (index=12)LOCATION '${location}/index=12'
        PARTITION (index=13)LOCATION '${location}/index=13'
        PARTITION (index=14)LOCATION '${location}/index=14'
        PARTITION (index=15)LOCATION '${location}/index=15'
        PARTITION (index=16)LOCATION '${location}/index=16'
          """

          // Local two-core master with an explicit warehouse directory.
          val sparkConf = new SparkConf()
            .setAppName("scala")
            .setMaster("local[2]")
          sparkConf.set("spark.sql.warehouse.dir", "file:///g:/home/warehouse")

          val hive = new HiveContext(new SparkContext(sparkConf))

          // Run the DDL in order: the table must exist before partitions are added.
          hive.sql(createTableSql)
          hive.sql(addPartitionsSql)

          // Six concurrent query threads, all sharing the same HiveContext.
          (1 to 6).foreach { _ =>
            new Query(hive).start()
          }
        }

        /**
         * Thread that endlessly counts the rows of `test_csv`, printing each result
         * and pausing 100 ms between queries.
         *
         * @param htcx the HiveContext used to run the query
         */
        class Query(htcx: HiveContext) extends Thread {

          // Every instance carries the same thread name.
          setName("Query-Thread")

          /** Runs the count query in an infinite loop; never returns normally. */
          override def run(): Unit =
            while (true) {
              val rowCount = htcx.sql("select count(*) from test_csv")
              rowCount.show()
              Thread.sleep(100)
            }
        }
      }
      

      Attachments

        Issue Links

          Activity

            People

              rdblue Ryan Blue
              pin_zhang pin_zhang
              Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: