Uploaded image for project: 'Apache Tez'
  1. Apache Tez
  2. TEZ-1223

Shuffle errors at 10 TB scale

    XML | Word | Printable | JSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • None
    • None
    • None

    Description

      When running a job with the following DAG at 10 TB scale, different shuffle exceptions occurred. Creating this as an umbrella ticket for tracking these errors. Most of them are related to ShuffleHeader parsing.

      DAG:
      =====
      digraph rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1

      { graph [ label="rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1", fontsize=24, fontname=Helvetica]; node [fontsize=12, fontname=Helvetica]; edge [fontsize=9, fontcolor=blue, fontname=Arial]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label = "Map_5[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ label = "Reducer_9[ReduceTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns" [ label = "Map_11[store_returns]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4" [ label = "Reducer_4[out_Reducer_4]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label = "Map_10[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ label = 
"Map_8[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label = "Map_1[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label = "Map_6[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" [ label = "Map_10[item]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" [ label = "Map_6[d3]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales" [ label = "Map_2[catalog_sales]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales" -> 
"rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ label = "Map_2[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "Reducer_3[ReduceTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales" [ label = "Map_8[store_sales]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" [ label = "Map_7[store]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" [ label = "Map_5[d2]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; 
"rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ label = "Reducer_4[ReduceTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4" [ label = "Output [outputClass=MROutput,\n initializer=]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ label = "Map_11[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" [ label = "Map_1[d1]", shape = "box" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label = "Map_7[MapTezProcessor]" ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; }

      Attachments

        1. shuffle_data.tar.gz
          5.60 MB
          Rajesh Balamohan

        Issue Links

          Activity

            People

              Assignee: rajesh.balamohan Rajesh Balamohan
              Reporter: rajesh.balamohan Rajesh Balamohan
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated: