Uploaded image for project: 'Apache Arrow'
  1. Apache Arrow
  2. ARROW-6001

[Python] Add from_pylist() and to_pylist() to pyarrow.Table to convert list of records

Details

    • Improvement
    • Status: Resolved
    • Minor
    • Resolution: Fixed
    • None
    • 7.0.0
    • Python

    Description

      I noticed that pyarrow.Table.to_pydict() exists, but pyarrow.Table.from_pydict() doesn't exist. There is a proposed ticket to create one, but it doesn't take into account potential mismatches between column order and number of columns.

      I'm including some code I've written, which I've been using to handle arrow conversions to ordered dictionaries and lists of dictionaries. I've also included an example where this can be used to speed up pandas.to_dict() by a factor of 6.

       

      def from_pylist(pylist, names=None, schema=None, safe=True):
          """
          Converts a python list of dictionaries to a pyarrow table.

          Each dictionary is one row. A key that is missing from a row
          becomes a null in the corresponding column.

          :param pylist: list of dictionaries, one per row
          :param names: list of column names; ignored when schema is given.
              When both names and schema are omitted, the keys of the first
              row are used (no columns at all if pylist is empty)
          :param schema: pyarrow schema giving column names, order and types
          :param safe: True or False, forwarded to pa.array()
          :return: arrow table
          """
          if schema is not None:
              # Walk the fields themselves instead of resolving each type via
              # get_field_index(name): that lookup returns -1 for a duplicated
              # name, which would silently select the last field's type.
              arrow_columns = [
                  pa.array([row.get(field.name) for row in pylist], safe=safe, type=field.type)
                  for field in schema
              ]
              # Passing the schema (not just its names) keeps its metadata.
              return pa.Table.from_arrays(arrow_columns, schema=schema)

          if names is None:
              # Previously this path failed with "NoneType is not iterable".
              names = list(pylist[0].keys()) if pylist else []
          arrow_columns = [
              pa.array([row.get(column) for row in pylist], safe=safe)
              for column in names
          ]
          return pa.Table.from_arrays(arrow_columns, names)
      
      def to_pylist(arrow_table, index_columns=None):
          """
          Converts a pyarrow table to a python list of dictionaries.

          Each dictionary is one row, keyed by column name. When index_columns
          is given (and non-empty), every row also gets an "_index" key holding
          a tuple of that row's values for the listed columns, in the order
          listed.

          :param arrow_table: arrow table
          :param index_columns: columns to index
          :return: python list of dictionaries
          """
          pydict = arrow_table.to_pydict()
          # Read the names once: the lookup does not change between rows, and
          # copying means the list handed out by the schema is never modified.
          names = list(arrow_table.schema.names)
          pylist = [
              {name: pydict[name][row] for name in names}
              for row in range(arrow_table.num_rows)
          ]
          if index_columns:
              for row, record in enumerate(pylist):
                  record["_index"] = tuple(pydict[name][row] for name in index_columns)
          return pylist
      
      def from_pydict(pydict, names=None, schema=None, safe=True):
          """
          Converts a python dictionary of columns to a pyarrow table.

          Each key is a column name and each value is that column's sequence
          of values. A requested column that is absent from pydict is filled
          with nulls, using the length of the first column in pydict (zero
          rows when pydict is empty).

          :param pydict: ordered dictionary mapping column name to values
          :param names: list of column names; ignored when schema is given.
              When omitted or empty, the keys of pydict are used
          :param schema: pyarrow schema giving column names, order and types
          :param safe: True or False, forwarded to pa.array()
          :return: arrow table
          """
          def null_column():
              # Computed only when a column is actually missing. The () default
              # makes an empty pydict give zero rows rather than an IndexError.
              first_column = next(iter(pydict.values()), ())
              return [None] * len(first_column)

          if schema is not None:
              arrow_columns = []
              # Walk the fields themselves instead of resolving each type via
              # get_field_index(name): that lookup returns -1 for a duplicated
              # name, which would silently select the last field's type.
              for field in schema:
                  values = pydict[field.name] if field.name in pydict else null_column()
                  arrow_columns.append(pa.array(values, safe=safe, type=field.type))
              # Passing the schema (not just its names) keeps its metadata.
              return pa.Table.from_arrays(arrow_columns, schema=schema)

          if not names:
              names = list(pydict)
          arrow_columns = [
              pa.array(pydict[column] if column in pydict else null_column(), safe=safe)
              for column in names
          ]
          return pa.Table.from_arrays(arrow_columns, names)
      
      def get_indexed_values(arrow_table, index_columns):
          """
          Collects the distinct combinations of values found in the given columns.

          Every row contributes one tuple holding its values for index_columns,
          in the order the columns are listed; duplicates collapse in the set.

          :param arrow_table: arrow_table
          :param index_columns: list of column names
          :return: set of tuples
          """
          column_data = arrow_table.to_pydict()
          unique_keys = set()
          for row_number in range(arrow_table.num_rows):
              key = tuple(column_data[name][row_number] for name in index_columns)
              unique_keys.add(key)
          return unique_keys
      

      Here are my benchmarks comparing the pandas-to-arrow-to-python route against pandas.to_dict():

       

      # benchmark panda conversion to python objects
      # NOTE(review): `time`, `pa`, `panda_df1` and `panda_df4` are not defined in
      # this snippet; presumably `pa` is pyarrow and the two frames are pandas
      # DataFrames of 1 and 4 million rows (per the labels below) - confirm.
      print('**benchmark 1 million rows**')
      # Baseline: pandas' own conversion to a list of per-row dictionaries.
      start_time = time.time()
      python_df1 = panda_df1.to_dict(orient='records')
      total_time = time.time() - start_time
      print("pandas to python: " + str(total_time))
      
      # Alternative: go through an arrow table, take its column dictionary and
      # rebuild one dictionary per row. The timed span includes from_pandas().
      start_time = time.time()
      arrow_df1 = pa.Table.from_pandas(panda_df1)
      pydict = arrow_df1.to_pydict()
      python_df1 = [{column: pydict[column][row] for column in arrow_df1.schema.names} for row in range(arrow_df1.num_rows)]
      total_time = time.time() - start_time
      print("pandas to arrow to python: " + str(total_time))
      
      # Same two measurements repeated on the larger frame.
      print('**benchmark 4 million rows**')
      start_time = time.time()
      python_df4 = panda_df4.to_dict(orient='records')
      total_time = time.time() - start_time
      print("pandas to python:: " + str(total_time))
      
      start_time = time.time()
      arrow_df4 = pa.Table.from_pandas(panda_df4)
      pydict = arrow_df4.to_pydict()
      python_df4 = [{column: pydict[column][row] for column in arrow_df4.schema.names} for row in range(arrow_df4.num_rows)]
      total_time = time.time() - start_time
      print("pandas to arrow to python: " + str(total_time))
      

        

      **benchmark 1 million rows**
      pandas to python: 13.204811334609985
      pandas to arrow to python: 2.00173282623291
      **benchmark 4 million rows**
      pandas to python:: 51.655067682266235
      pandas to arrow to python: 8.562284231185913
      

      Attachments

        Issue Links

          Activity

            People

              alenka Alenka Frim
              davlee1972@yahoo.com David Lee
              Votes:
              0 Vote for this issue
              Watchers:
              6 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Time Tracking

                  Estimated:
                  Original Estimate - Not Specified
                  Not Specified
                  Remaining:
                  Remaining Estimate - 0h
                  0h
                  Logged:
                  Time Spent - 4h 50m
                  4h 50m

                  Slack

                    Issue deployment