Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-30503

OnlineLDAOptimizer does not handle persistence correctly

    XML | Word | Printable | JSON

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Minor
    • Resolution: Fixed
    • Affects Version/s: 2.4.4, 3.0.0
    • Fix Version/s: 3.0.0
    • Component/s: GraphX, ML
    • Labels:
      None

      Description

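      It seems that in OnlineLDAOptimizer, PeriodicGraphCheckpointer cannot unpersist edges correctly. In the session below, no RDDs are persisted before fitting, but 106 RDDs (almost all of them EdgeRDDs) are still persisted after a single call to fit: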

      scala> import org.apache.spark.ml.clustering.LDA
      import org.apache.spark.ml.clustering.LDA
      
      scala> val dataset = spark.read.format("libsvm").load("data/mllib/sample_lda_libsvm_data.txt")
      20/01/13 20:00:30 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan. dataset: org.apache.spark.sql.DataFrame = [label: double, features: vector]
      
      scala> val lda = new LDA().setK(10).setMaxIter(100).setOptimizer("em")
      lda: org.apache.spark.ml.clustering.LDA = lda_0e9a6cf09801
      
      scala> sc.getPersistentRDDs
      res0: scala.collection.Map[Int,org.apache.spark.rdd.RDD[_]] = Map()
      
      scala> val model = lda.fit(dataset)
      model: org.apache.spark.ml.clustering.LDAModel = DistributedLDAModel: uid=lda_0e9a6cf09801, k=10, numFeatures=11
      
      scala> sc.getPersistentRDDs
      res1: scala.collection.Map[Int,org.apache.spark.rdd.RDD[_]] = Map(809 -> EdgeRDD MapPartitionsRDD[809] at mapPartitions at EdgeRDDImpl.scala:119, 1337 -> EdgeRDD MapPartitionsRDD[1337] at mapPartitions at EdgeRDDImpl.scala:119, 977 -> EdgeRDD MapPartitionsRDD[977] at mapPartitions at EdgeRDDImpl.scala:119, 1073 -> EdgeRDD MapPartitionsRDD[1073] at mapPartitions at EdgeRDDImpl.scala:119, 449 -> EdgeRDD MapPartitionsRDD[449] at mapPartitions at EdgeRDDImpl.scala:119, 1793 -> EdgeRDD MapPartitionsRDD[1793] at mapPartitions at EdgeRDDImpl.scala:119, 185 -> EdgeRDD MapPartitionsRDD[185] at mapPartitions at EdgeRDDImpl.scala:119, 1001 -> EdgeRDD MapPartitionsRDD[1001] at mapPartitions at EdgeRDDImpl.scala:119, 1601 -> EdgeRDD MapPartitionsRDD[1601] at mapPartitions a...
      
      scala> sc.getPersistentRDDs.size
      res2: Int = 106
      
      scala> sc.getPersistentRDDs.foreach(println)
      (809,EdgeRDD MapPartitionsRDD[809] at mapPartitions at EdgeRDDImpl.scala:119)
      (1337,EdgeRDD MapPartitionsRDD[1337] at mapPartitions at EdgeRDDImpl.scala:119)
      (977,EdgeRDD MapPartitionsRDD[977] at mapPartitions at EdgeRDDImpl.scala:119)
      (1073,EdgeRDD MapPartitionsRDD[1073] at mapPartitions at EdgeRDDImpl.scala:119)
      (449,EdgeRDD MapPartitionsRDD[449] at mapPartitions at EdgeRDDImpl.scala:119)
      (1793,EdgeRDD MapPartitionsRDD[1793] at mapPartitions at EdgeRDDImpl.scala:119)
      (185,EdgeRDD MapPartitionsRDD[185] at mapPartitions at EdgeRDDImpl.scala:119)
      (1001,EdgeRDD MapPartitionsRDD[1001] at mapPartitions at EdgeRDDImpl.scala:119)
      (1601,EdgeRDD MapPartitionsRDD[1601] at mapPartitions at EdgeRDDImpl.scala:119)
      (1529,EdgeRDD MapPartitionsRDD[1529] at mapPartitions at EdgeRDDImpl.scala:119)
      (1265,EdgeRDD MapPartitionsRDD[1265] at mapPartitions at EdgeRDDImpl.scala:119)
      (257,EdgeRDD MapPartitionsRDD[257] at mapPartitions at EdgeRDDImpl.scala:119)
      (1409,EdgeRDD MapPartitionsRDD[1409] at mapPartitions at EdgeRDDImpl.scala:119)
      (1985,EdgeRDD MapPartitionsRDD[1985] at mapPartitions at EdgeRDDImpl.scala:119)
      (785,EdgeRDD MapPartitionsRDD[785] at mapPartitions at EdgeRDDImpl.scala:119)
      (1313,EdgeRDD MapPartitionsRDD[1313] at mapPartitions at EdgeRDDImpl.scala:119)
      (1577,EdgeRDD MapPartitionsRDD[1577] at mapPartitions at EdgeRDDImpl.scala:119)
      (881,EdgeRDD MapPartitionsRDD[881] at mapPartitions at EdgeRDDImpl.scala:119)
      (29,VertexRDD, VertexRDD ZippedPartitionsRDD2[29] at zipPartitions at VertexRDD.scala:322)
      (2105,EdgeRDD MapPartitionsRDD[2105] at mapPartitions at EdgeRDDImpl.scala:119)
      (353,EdgeRDD MapPartitionsRDD[353] at mapPartitions at EdgeRDDImpl.scala:119)
      (905,EdgeRDD MapPartitionsRDD[905] at mapPartitions at EdgeRDDImpl.scala:119)
      (1169,EdgeRDD MapPartitionsRDD[1169] at mapPartitions at EdgeRDDImpl.scala:119)
      (89,EdgeRDD MapPartitionsRDD[89] at mapPartitions at EdgeRDDImpl.scala:119)
      (1433,EdgeRDD MapPartitionsRDD[1433] at mapPartitions at EdgeRDDImpl.scala:119)
      (1697,EdgeRDD MapPartitionsRDD[1697] at mapPartitions at EdgeRDDImpl.scala:119)
      (233,EdgeRDD MapPartitionsRDD[233] at mapPartitions at EdgeRDDImpl.scala:119)
      (761,EdgeRDD MapPartitionsRDD[761] at mapPartitions at EdgeRDDImpl.scala:119)
      (2441,EdgeRDD MapPartitionsRDD[2441] at mapPartitions at EdgeRDDImpl.scala:119)
      (2249,EdgeRDD MapPartitionsRDD[2249] at mapPartitions at EdgeRDDImpl.scala:119)
      (1217,EdgeRDD MapPartitionsRDD[1217] at mapPartitions at EdgeRDDImpl.scala:119)
      (137,EdgeRDD MapPartitionsRDD[137] at mapPartitions at EdgeRDDImpl.scala:119)
      (2414,VertexRDD, VertexRDD ZippedPartitionsRDD2[2414] at zipPartitions at VertexRDD.scala:322)
      (65,EdgeRDD MapPartitionsRDD[65] at mapPartitions at EdgeRDDImpl.scala:119)
      (329,EdgeRDD MapPartitionsRDD[329] at mapPartitions at EdgeRDDImpl.scala:119)
      (665,EdgeRDD MapPartitionsRDD[665] at mapPartitions at EdgeRDDImpl.scala:119)
      (1457,EdgeRDD MapPartitionsRDD[1457] at mapPartitions at EdgeRDDImpl.scala:119)
      (2345,EdgeRDD MapPartitionsRDD[2345] at mapPartitions at EdgeRDDImpl.scala:119)
      (1121,EdgeRDD MapPartitionsRDD[1121] at mapPartitions at EdgeRDDImpl.scala:119)
      (593,EdgeRDD MapPartitionsRDD[593] at mapPartitions at EdgeRDDImpl.scala:119)
      (857,EdgeRDD MapPartitionsRDD[857] at mapPartitions at EdgeRDDImpl.scala:119)
      (1361,EdgeRDD MapPartitionsRDD[1361] at mapPartitions at EdgeRDDImpl.scala:119)
      (1937,EdgeRDD MapPartitionsRDD[1937] at mapPartitions at EdgeRDDImpl.scala:119)
      (1889,EdgeRDD MapPartitionsRDD[1889] at mapPartitions at EdgeRDDImpl.scala:119)
      (2153,EdgeRDD MapPartitionsRDD[2153] at mapPartitions at EdgeRDDImpl.scala:119)
      (569,EdgeRDD MapPartitionsRDD[569] at mapPartitions at EdgeRDDImpl.scala:119)
      (1241,EdgeRDD MapPartitionsRDD[1241] at mapPartitions at EdgeRDDImpl.scala:119)
      (2057,EdgeRDD MapPartitionsRDD[2057] at mapPartitions at EdgeRDDImpl.scala:119)
      (953,EdgeRDD MapPartitionsRDD[953] at mapPartitions at EdgeRDDImpl.scala:119)
      (425,EdgeRDD MapPartitionsRDD[425] at mapPartitions at EdgeRDDImpl.scala:119)
      (2033,EdgeRDD MapPartitionsRDD[2033] at mapPartitions at EdgeRDDImpl.scala:119)
      (32,EdgeRDD MapPartitionsRDD[32] at mapPartitions at EdgeRDDImpl.scala:119)
      (161,EdgeRDD MapPartitionsRDD[161] at mapPartitions at EdgeRDDImpl.scala:119)
      (689,EdgeRDD MapPartitionsRDD[689] at mapPartitions at EdgeRDDImpl.scala:119)
      (2225,EdgeRDD MapPartitionsRDD[2225] at mapPartitions at EdgeRDDImpl.scala:119)
      (2393,EdgeRDD MapPartitionsRDD[2393] at mapPartitions at EdgeRDDImpl.scala:119)
      (281,EdgeRDD MapPartitionsRDD[281] at mapPartitions at EdgeRDDImpl.scala:119)
      (545,EdgeRDD MapPartitionsRDD[545] at mapPartitions at EdgeRDDImpl.scala:119)
      (641,EdgeRDD MapPartitionsRDD[641] at mapPartitions at EdgeRDDImpl.scala:119)
      (713,EdgeRDD MapPartitionsRDD[713] at mapPartitions at EdgeRDDImpl.scala:119)
      (1865,EdgeRDD MapPartitionsRDD[1865] at mapPartitions at EdgeRDDImpl.scala:119)
      (113,EdgeRDD MapPartitionsRDD[113] at mapPartitions at EdgeRDDImpl.scala:119)
      (377,EdgeRDD MapPartitionsRDD[377] at mapPartitions at EdgeRDDImpl.scala:119)
      (737,EdgeRDD MapPartitionsRDD[737] at mapPartitions at EdgeRDDImpl.scala:119)
      (2129,EdgeRDD MapPartitionsRDD[2129] at mapPartitions at EdgeRDDImpl.scala:119)
      (521,EdgeRDD MapPartitionsRDD[521] at mapPartitions at EdgeRDDImpl.scala:119)
      (1841,EdgeRDD MapPartitionsRDD[1841] at mapPartitions at EdgeRDDImpl.scala:119)
      (2369,EdgeRDD MapPartitionsRDD[2369] at mapPartitions at EdgeRDDImpl.scala:119)
      (2390,VertexRDD, VertexRDD ZippedPartitionsRDD2[2390] at zipPartitions at VertexRDD.scala:322)
      (473,EdgeRDD MapPartitionsRDD[473] at mapPartitions at EdgeRDDImpl.scala:119)
      (209,EdgeRDD MapPartitionsRDD[209] at mapPartitions at EdgeRDDImpl.scala:119)
      (617,EdgeRDD MapPartitionsRDD[617] at mapPartitions at EdgeRDDImpl.scala:119)
      (1145,EdgeRDD MapPartitionsRDD[1145] at mapPartitions at EdgeRDDImpl.scala:119)
      (1049,EdgeRDD MapPartitionsRDD[1049] at mapPartitions at EdgeRDDImpl.scala:119)
      (1961,EdgeRDD MapPartitionsRDD[1961] at mapPartitions at EdgeRDDImpl.scala:119)
      (1025,EdgeRDD MapPartitionsRDD[1025] at mapPartitions at EdgeRDDImpl.scala:119)
      (497,EdgeRDD MapPartitionsRDD[497] at mapPartitions at EdgeRDDImpl.scala:119)
      (1649,EdgeRDD MapPartitionsRDD[1649] at mapPartitions at EdgeRDDImpl.scala:119)
      (1553,EdgeRDD MapPartitionsRDD[1553] at mapPartitions at EdgeRDDImpl.scala:119)
      (1817,EdgeRDD MapPartitionsRDD[1817] at mapPartitions at EdgeRDDImpl.scala:119)
      (1913,EdgeRDD MapPartitionsRDD[1913] at mapPartitions at EdgeRDDImpl.scala:119)
      (1289,EdgeRDD MapPartitionsRDD[1289] at mapPartitions at EdgeRDDImpl.scala:119)
      (1385,EdgeRDD MapPartitionsRDD[1385] at mapPartitions at EdgeRDDImpl.scala:119)
      (1721,EdgeRDD MapPartitionsRDD[1721] at mapPartitions at EdgeRDDImpl.scala:119)
      (2273,EdgeRDD MapPartitionsRDD[2273] at mapPartitions at EdgeRDDImpl.scala:119)
      (1481,EdgeRDD MapPartitionsRDD[1481] at mapPartitions at EdgeRDDImpl.scala:119)
      (1745,EdgeRDD MapPartitionsRDD[1745] at mapPartitions at EdgeRDDImpl.scala:119)
      (401,EdgeRDD MapPartitionsRDD[401] at mapPartitions at EdgeRDDImpl.scala:119)
      (2009,EdgeRDD MapPartitionsRDD[2009] at mapPartitions at EdgeRDDImpl.scala:119)
      (2081,EdgeRDD MapPartitionsRDD[2081] at mapPartitions at EdgeRDDImpl.scala:119)
      (929,EdgeRDD MapPartitionsRDD[929] at mapPartitions at EdgeRDDImpl.scala:119)
      (1193,EdgeRDD MapPartitionsRDD[1193] at mapPartitions at EdgeRDDImpl.scala:119)
      (833,EdgeRDD MapPartitionsRDD[833] at mapPartitions at EdgeRDDImpl.scala:119)
      (36,EdgeRDD MapPartitionsRDD[36] at mapPartitionsWithIndex at GraphImpl.scala:106)
      (1097,EdgeRDD MapPartitionsRDD[1097] at mapPartitions at EdgeRDDImpl.scala:119)
      (1625,EdgeRDD MapPartitionsRDD[1625] at mapPartitions at EdgeRDDImpl.scala:119)
      (1673,EdgeRDD MapPartitionsRDD[1673] at mapPartitions at EdgeRDDImpl.scala:119)
      (305,EdgeRDD MapPartitionsRDD[305] at mapPartitions at EdgeRDDImpl.scala:119)
      (2201,EdgeRDD MapPartitionsRDD[2201] at mapPartitions at EdgeRDDImpl.scala:119)
      (2417,EdgeRDD MapPartitionsRDD[2417] at mapPartitions at EdgeRDDImpl.scala:119)
      (1505,EdgeRDD MapPartitionsRDD[1505] at mapPartitions at EdgeRDDImpl.scala:119)
      (2321,EdgeRDD MapPartitionsRDD[2321] at mapPartitions at EdgeRDDImpl.scala:119)
      (2438,VertexRDD, VertexRDD ZippedPartitionsRDD2[2438] at zipPartitions at VertexRDD.scala:322)
      (2297,EdgeRDD MapPartitionsRDD[2297] at mapPartitions at EdgeRDDImpl.scala:119)
      (1769,EdgeRDD MapPartitionsRDD[1769] at mapPartitions at EdgeRDDImpl.scala:119)
      (2177,EdgeRDD MapPartitionsRDD[2177] at mapPartitions at EdgeRDDImpl.scala:119)
       

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                podongfeng zhengruifeng
                Reporter:
                podongfeng zhengruifeng
              • Votes:
                0 Vote for this issue
                Watchers:
                1 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: