a
    <b/v                     @  s  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZ ddlm  m  mZ ddlm Z  dddddZ!ddddddZ"G dd dZ#G dd dZ$G dd de ej%Z&dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)FilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r   f/Users/vegardjervell/Documents/master/model/venv/lib/python3.9/site-packages/pandas/io/sas/sas7bdat.py_parse_datetime1   s    r   z	pd.Series)sas_datetimesr   returnc                 C  sH   zt j| |ddW S  tyB   | jt|d}tt j|}| Y S 0 dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r   to_datetimer
   applyr   r   Series)r   r   Zs_seriesr   r   r   _convert_datetimes?   s    r&   c                   @  sB   e Zd ZU ded< ded< ded< ded< dddddddZd	S )
_SubheaderPointerintoffsetlengthcompressionptyper)   r*   r+   r,   c                 C  s   || _ || _|| _|| _d S Nr-   )selfr)   r*   r+   r,   r   r   r   __init___   s    z_SubheaderPointer.__init__N__name__
__module____qualname____annotations__r0   r   r   r   r   r'   Y   s
   
r'   c                   @  sV   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
ddZdS )_Columnr(   col_idzstr | bytesnamelabelformatbytesctyper*   r7   r8   r9   r:   r<   r*   c                 C  s(   || _ || _|| _|| _|| _|| _d S r.   r=   )r/   r7   r8   r9   r:   r<   r*   r   r   r   r0   n   s    
z_Column.__init__Nr1   r   r   r   r   r6   f   s   
r6   c                   @  s  e Zd ZU dZded< ded< dVdd	d
dZddddZddddZddddZddddZ	ddddZ
dd ZdddddZdddddd Zddd!d"d#Zddd$d%Zd&dd'd(Zd)d* Zddd+d,Zd-dd.d/d0Zddd1d2d3d4Zdd-d5d6d7Zdd1dd8d9d:Zdddd;d<d=Zdddd;d>d?Zdddd;d@dAZdddd;dBdCZdddd;dDdEZdddd;dFdGZdddd;dHdIZdddd;dJdKZdWdLdMdNdOdPZdQdR Z dSddTdUZ!dS )XSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r(   _int_lengthzbytes | None_cached_pageNTzFilePath | ReadBuffer[bytes])path_or_bufc	           	      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|ddd| _| jj| _z|   |   W n ty   |    Y n0 d S )Nzlatin-1    r   rbF)Zis_text)indexconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textdefault_encodingr+   column_names_stringscolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersr@   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handleshandle_path_or_buf_get_properties_parse_metadata	Exceptionclose)	r/   rA   rD   rE   rF   rG   rH   rI   rJ   r   r   r   r0      s:    
zSAS7BDATReader.__init__z
np.ndarray)r    c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrQ   int64r/   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsr\   )r^   r_   rR   r`   ra   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1r\   )r^   r_   rS   r]   ra   r   r   r   column_types   s    zSAS7BDATReader.column_typesNonec                 C  s   | j   d S r.   )rU   r[   ra   r   r   r   r[      s    zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkr<tdd\}}| tj	tj
}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrtj}|| }| tjtj}|d	krd
| _nd| _| tjtjd }|tjv rtj| | _nd| d| _| tjtj }|dkrJd| _!n|dkr\d| _!nd| _!| tj"tj#}|$d| _%| j&r| j%'| j(p| j)| _%| tj*tj+}|$d| _,| j&r| j,'| j(p| j)| _,t-ddd}| .tj/| tj0}|t1j2|dd | _3| .tj4| tj5}|t1j2|dd | _6| 7tj8| tj9| _:| j | j:d }|  j|7  _t| j| j:krtd| 7tj;| tj<| _=| 7tj>| tj?| _@| tjA| tjB}|$d| _C| j&r| jC'| j(p| j)| _C| tjD| tjE}|$d| _F| j&r2| jF'| j(p,| j)| _F| tjG| tjH}|$d| _I| j&rr| jI'| j(pl| j)| _I| tjJ| tjK}|$d}t|dkr|'| j(p| j)| _Ln@| tjM| tjN}|$d| _L| j&r| jL'| j(p| j)| _Ld S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2Zwindowsunknown     r   r   r   r"   z*The SAS7BDAT file appears to be truncated.)OrW   seekreadr@   lenconstmagicr   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64r?   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatformZdataset_offsetZdataset_lengthrstripr8   rJ   decoderH   rK   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r/   Zalign1Zalign2bufZtotal_alignepochxr   r   r   rX      s    





zSAS7BDATReader._get_propertiesc                 C  s*   | j | jpdd}|d u r&|   t|S )Nr   )nrows)rs   rG   r[   StopIteration)r/   dar   r   r   __next__h  s
    zSAS7BDATReader.__next__)r)   widthc                 C  sJ   |dvr|    td| ||}|dkr0dnd}t| j| |d S )N)rh   rg   zinvalid float widthrh   fr   r   r[   r   rw   structunpackr{   )r/   r)   r   r   fdr   r   r   r   p  s    zSAS7BDATReader._read_float)r)   r   r    c                 C  sP   |dvr|    td| ||}ddddd| }t| j| |d }|S )N)r      rh   rg   zinvalid int widthbhlqr   r   )r/   r)   r   r   itZivr   r   r   r   y  s    zSAS7BDATReader._read_int)r)   r*   c                 C  s   | j d u rX| j| | j|}t||k rT|   d|dd|dd}t||S || t| j krz|   td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r@   rW   rr   rs   rt   r[   r   )r/   r)   r*   r   msgr   r   r   rw     s    
zSAS7BDATReader._read_bytesc                 C  sN   d}|sJ| j | j| _t| jdkr(qJt| j| jkr@td|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)rW   rs   r   r@   rt   r   _process_page_meta)r/   doner   r   r   rY     s    zSAS7BDATReader._parse_metadataboolc                 C  sZ   |    tjtjgtj }| j|v r,|   | jtj@ }| jtjv }t|pV|pV| j	g kS r.   )
_read_page_headerru   page_meta_typeZpage_amd_typepage_mix_types_current_page_type_process_page_metadatapage_data_typer   rP   )r/   ptis_data_pageZis_mix_pager   r   r   r     s    
z!SAS7BDATReader._process_page_metac                 C  sX   | j }tj| }| |tj| _tj| }| |tj| _tj	| }| |tj
| _d S r.   )ry   ru   Zpage_type_offsetr   Zpage_type_lengthr   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r/   
bit_offsetZtxr   r   r   r     s    


z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]Z}| tj| |}|jdkr2q|jtjkr@q| 	|j
}| ||j|j}| || qd S )Nr   )ry   ranger   _process_subheader_pointersru   Zsubheader_pointers_offsetr*   r+   Ztruncated_subheader_id_read_subheader_signaturer)   _get_subheader_indexr,   _process_subheader)r/   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s    


z%SAS7BDATReader._process_page_metadatar;   )	signaturer    c                 C  s`   t j|}|d u r\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|   t	d|S )Nr   rB   zUnknown subheader signature)
ru   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer+   SASIndexdata_subheader_indexr[   r   )r/   r   r+   r,   rD   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexr'   )r)   subheader_pointer_indexr    c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )rz   r   r?   r'   )
r/   r)   r   Zsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typer   r   r   r   r     s    

z*SAS7BDATReader._process_subheader_pointers)r)   r    c                 C  s   |  || j}|S r.   )rw   r?   )r/   r)   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signature)r   r   r    c                 C  s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| j| d S td||| d S )Nzunknown subheader index)r)   r*   ru   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   rP   appendr   )r/   r   r   r)   r*   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheader)r)   r*   r    c                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r?   rx   r   ru   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r/   r)   r*   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s0    

z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r?   r   column_countr   r   print)r/   r)   r*   r   r   r   r   r   (  s    
z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r.   r   r/   r)   r*   r   r   r   r   3  s    z(SAS7BDATReader._process_subheader_countsc           
      C  s  || j 7 }| |tj}| ||}|d| d}|}| jrR|| jpN| j	}| j
| t| j
dkrd}tjD ]}||v rx|}qx|| _|| j 8 }|d }	| jr|	d7 }	| |	| j}|d}|dkrd| _|d }	| jr|	d7 }	| |	| j}|d| j | _n|tjkrR|d	 }	| jr2|	d7 }	| |	| j}|d| j | _nH| jdkrd| _|d }	| jr||	d7 }	| |	| j}|d| j | _| jrt| d
r| j| jp| j	| _d S )Nr   rq   r   rB      rh           (   creator_proc)r?   r   ru   Ztext_block_size_lengthrw   r~   rJ   r   rH   rK   rL   r   rt   Zcompression_literalsr+   rx   r   r   r   Zrle_compressionhasattr)
r/   r)   r*   Ztext_block_sizer   Z	cname_rawcnameZcompression_literalZclZoffset1r   r   r   r   6  sZ    




z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }| j||
|
|   q*d S )Nr      rg   r   )r?   r   ru   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrL   rM   r   )r/   r)   r*   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_strr   r   r   r   j  s>    
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   rg   r      d   s)r?   r   ru   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rR   r   Zcolumn_data_length_lengthrQ   Zcolumn_type_lengthrS   )
r/   r)   r*   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r   r   r   r     s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r.   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sl  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }||||  }| j| }||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r?   ru   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminrt   rL   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthrO   r6   rM   rS   rQ   rN   r   )r/   r)   r*   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r     sL    


	z(SAS7BDATReader._process_format_subheaderz
int | NonezDataFrame | None)r   r    c                 C  s   |d u r| j d ur| j }n|d u r(| j}t| jdkrF|   td| j| jkrVd S | j| j }||krn|}| jd}| jd}tj	||ft
d| _tj|d| ftjd| _d| _t| }|| |  }| jd ur|| j}|S )Nr   zNo columns to parse from filer   r   r\   rg   )rG   r   rt   rS   r[   r	   rT   countr^   emptyobject_string_chunkZzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rs   _chunk_to_dataframerD   Z	set_index)r/   r   mndnsprsltr   r   r   rs     s.    

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkr(dS t| j| jkrf|   dt| jdd| jdd}t||   | j	}|t
jkr|   |t
j@ }t
jgt
j }|s| j	|vr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rP   rW   rs   r   r@   rt   r[   r   r   r   ru   r   r   r   r   _read_next_page)r/   r   Z	page_typer   r   r   r   r   r     s,    


zSAS7BDATReader._read_next_pager   c                 C  s  | j }| j}t|| |}i }d\}}t| jD ]\}| j| }| j| dkr| j|d d f j| jd d}	t	j
|	tj|d||< | jr| j| tjv rt|| d||< n"| j| tjv rt|| d||< |d7 }q0| j| dkrnt	j
| j|d d f |d	||< | jr<| jd ur<|| j| jp4| j||< | jrd|| j d
k}
tj|| |
< |d7 }q0|   tdt| j|  q0t|| j|dd}|S )Nrf   r   r   r\   )r]   rD   r   r   r   )rD   r   zunknown column type F)rO   rD   copy) r   rT   r   r   rM   rS   r   viewr{   r   r%   r^   Zfloat64rE   rN   ru   Zsas_date_formatsr&   Zsas_datetime_formatsr   rI   rH   r   r   rK   rF   rt   nanr[   r   reprr   )r/   nr   ixr   ZjsZjbjr8   Zcol_arriiZdfr   r   r   r     s<    
 
 

z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)"r2   r3   r4   __doc__r5   r0   rb   rc   rd   r[   rX   r   r   r   rw   rY   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rs   r   r   r   r   r   r   r>      sP   
       0 		
4 1"r>   )'r   
__future__r   collectionsr   r   r   r   typingr   Znumpyr^   Zpandas._typingr   r   Zpandas.errorsr	   r
   Zpandasr   r   r   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsru   Zpandas.io.sas.sasreaderr   r   r&   r'   r6   Iteratorr>   r   r   r   r   <module>   s&   