Uploaded image for project: 'aardvark'
  1. aardvark
  2. AAR-51788

[R]: CSV parser got out of sync with chunker

    XML | Word | Printable | JSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • Debian Linux

    Description

      Hello,
      Unfortunately, the example involves a large dataset: according to my tests, the error appears once the number of lines read goes above 1.6 million.

      The data can be downloaded as a compressed file from the link below (there is nothing dangerous in the link).

      https://e.pcloud.link/publink/show?code=XZqHIeZokLxWCpx940hw3y45fsKqJPAVK0X

      Decompressing the file yields a TSV (tab-separated values) file. Using a script I have had for quite some time, I want to open that TSV file and then save it as a Parquet file without holding it (entirely) in memory.

      Please have a look at the script below. Any ideas?

      #################################################################

      library(arrow)
      #> 
      #> Attaching package: 'arrow'
      #> The following object is masked from 'package:utils':
      #> 
      #>     timestamp

      # Open the tab-separated export as an Arrow dataset. Column names and
      # types are taken from the explicit schema below.
      data <- open_dataset("export.tsv",
        format = "tsv",
        # Skip the first row of the file; presumably this is the header line,
        # since the schema already supplies the column names -- TODO confirm.
        skip_rows = 1, 
        # 30 columns: every field is declared as a string except the two EUR
        # amount columns, which are declared as doubles.
        schema = schema(
          AID_MEASURE_ID = string(), 
          DATE_CREATED = string(), 
          DATE_GRANTED = string(), 
          AA_PUBLISHED_DATE = string(), 
          SERVER_REF = string(), 
          AM_TITLE = string(), 
          AM_TITLE_EN = string(), 
          STATUS = string(), 
          AM_PROC_TYPE_CD = string(), 
          COFINANCE = string(), 
          OBJECTIVE = string(), 
          OTHER_OBJECTIVE_EN = string(), 
          AID_INSTRUMENT = string(), 
          OTHER_AID_INSTRUMENT_EN = string(), 
          BENEFICIARY_NAME = string(), 
          BENEFICIARY_NAME_ENGLISH = string(), 
          BENEFICIARY_NATIONAL_ID = string(), 
          BENEFICIARY_NAT_ID_TYPE_SD = string(), 
          BENEFICIARY_TYPE_SD = string(), 
          COUNTRY_SD = string(), 
          REGION_SD = string(), 
          SECTOR_SD = string(), 
          GRANTED_AMOUNT_FROM_EUR = double(), 
          NOMINAL_AMOUNT_EUR_FROM = double(), 
          GRANT_RANGE = string(),
          GRANTED_AMOUNT_RANGE_DESC=string(),
          GRANTING_AUTHORITY_NAME = string(), 
          GRANTING_AUTHORITY_NAME_EN = string(), 
          NUTS_CD = string(), 
          GRANTING_AUTHORITY_COUNTRY = string()
        )
        )

        
      # Write the dataset to the current directory as Parquet, with at most
      # 1e7 rows per output file. This is the call that raises the
      # "CSV parser got out of sync with chunker" error shown below.
      # NOTE(review): this error may be caused by newline characters embedded
      # in field values; if so, the CSV parse option `newlines_in_values` may
      # be relevant -- unverified for this file, confirm against the data.
      write_dataset(
        data,
        format = "parquet",
        path = ".",
        max_rows_per_file = 1e7
      )
      #> Error: Invalid: CSV parser got out of sync with chunker
          

      sessionInfo()
      #> R version 4.3.2 (2023-10-31)
      #> Platform: x86_64-pc-linux-gnu (64-bit)
      #> Running under: Debian GNU/Linux 12 (bookworm)
      #> 
      #> Matrix products: default
      #> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.11.0 
      #> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0
      #> 
      #> locale:
      #>  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
      #>  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
      #>  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
      #>  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
      #>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
      #> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
      #> 
      #> time zone: Europe/Brussels
      #> tzcode source: system (glibc)
      #> 
      #> attached base packages:
      #> [1] stats     graphics  grDevices utils     datasets  methods   base     
      #> 
      #> other attached packages:
      #> [1] arrow_14.0.0.2
      #> 
      #> loaded via a namespace (and not attached):
      #>  [1] vctrs_0.6.4       cli_3.6.1         knitr_1.45        rlang_1.1.2      
      #>  [5] xfun_0.41         purrr_1.0.2       styler_1.10.2     generics_0.1.3   
      #>  [9] assertthat_0.2.1  glue_1.6.2        bit_4.0.5         htmltools_0.5.7  
      #> [13] fansi_1.0.5       rmarkdown_2.25    R.cache_0.16.0    tibble_3.2.1     
      #> [17] evaluate_0.23     fastmap_1.1.1     yaml_2.3.7        lifecycle_1.0.4  
      #> [21] compiler_4.3.2    dplyr_1.1.3       fs_1.6.3          pkgconfig_2.0.3  
      #> [25] R.oo_1.25.0       R.utils_2.12.2    digest_0.6.33     R6_2.5.1         
      #> [29] utf8_1.2.4        reprex_2.0.2      tidyselect_1.2.0  pillar_1.9.0     
      #> [33] magrittr_2.0.3    R.methodsS3_1.8.2 tools_4.3.2       withr_2.5.2      
      #> [37] bit64_4.0.5

      #################################################

      Attachments

        Activity

          People

            Unassigned Unassigned
            larry77 Lorenzo Isella
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated:

              Time Tracking

                Estimated:
                Original Estimate - 24h
                24h
                Remaining:
                Remaining Estimate - 24h
                24h
                Logged:
                Time Spent - Not Specified
                Not Specified