Description
When the normalized affinity matrix has multiple leading eigenvalues, power iteration clustering (PIC) gives incorrect results.
We should either raise an error or log a warning for the user when PIC does not converge (i.e.,
when |\lambda_1/\lambda_2| = 1).
test("Fail to converge: Multiple leading eigen values") {
  /*
     Graph (two disconnected edges):

        2
       /
      /
     1        3 - - 4

     Adjacency matrix:

         [(0, 1, 0, 0),
          (1, 0, 0, 0),
     A =  (0, 0, 0, 1),
          (0, 0, 1, 0)]
   */
  // Edge list (src, dst, weight) for the graph sketched above.
  val edges = Seq[(Long, Long, Double)](
    (1, 2, 1.0),
    (3, 4, 1.0)
  ).toDF("src", "dst", "weight")

  val pic = new PowerIterationClustering()
    .setK(2)
    .setMaxIter(20)
    .setInitMode("random")
    .setWeightCol("weight")

  val assignments = pic.assignClusters(edges).select("id", "cluster")

  // One bucket of vertex ids per cluster index (k = 2).
  val membersByCluster = Array.fill(2)(mutable.Set.empty[Long])
  assignments.collect().foreach {
    case Row(vertexId: Long, clusterId: Integer) =>
      membersByCluster(clusterId) += vertexId
  }

  // Each connected component should end up in its own cluster.
  val expected = Set(Set(1, 2), Set(3, 4))
  assert(membersByCluster.toSet == expected)
}