o
     eO                  	   @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ d
Zd
ZdZdZddddddddZG dd dZ G dd dZ!G dd dZ"dededefddZ#e $e!j%j&' Z(e $e"j&' Z)dedede fdd Z*dedede+fd!d"Z,efd#ed$ede+fd%d&Z-eefded'ed(ede+fd)d*Z.d-ded$edefd+d,Z/dS ).    N)
itemgetter)	AnyDict	GeneratorListMatchOptionalPatternTupleUnion   )T_numT_obj
T_obj_iter
T_obj_list   )cluster_objects)to_list)objects_to_bbox   g      @   ffffifflfiflst)u   ﬀu   ﬃu   ﬄu   ﬁu   ﬂu   ﬆu   ﬅc                   @   s   e Zd ZdZdeeeee f  ddfddZ				dd	e
e d
edededeeef f
ddZ					ddeeee f dedededed
edeeeef  fddZ	ddededeeeef  fddZdS )TextMapz
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    tuplesreturnNc                 C   s    || _ dttd|| _d S )N r   )r   joinmapr   	as_stringselfr    r&   H/var/www/html/venv/lib/python3.10/site-packages/pdfplumber/utils/text.py__init__#   s   zTextMap.__init__r   Tm
main_groupreturn_groupsreturn_charsc                 C   sn   | j |||| }dd |D }t|\}}}	}
|||||	|
d}|r/| |d< |r5||d< |S )Nc                 S   s   g | ]
\}}|d ur|qS Nr&   ).0textcr&   r&   r'   
<listcomp>/   s    z)TextMap.match_to_dict.<locals>.<listcomp>)r/   x0topx1bottomgroupschars)r   startendr   groupr6   )r%   r)   r*   r+   r,   subsetr7   r2   r3   r4   r5   resultr&   r&   r'   match_to_dict'   s   zTextMap.match_to_dictpatternregexcasec                    s   t |tr|du rtd|du rtd|}n|du r!t|}|du r(tjnd}t||}t|j}	t	 fdd|	}
 fdd|
D S )	NFzACannot pass a compiled search pattern *and* regex=False together.z@Cannot pass a compiled search pattern *and* case=False together.r   c                    s   t |   S r-   )boolr:   strip)r)   )r*   r&   r'   <lambda>`   s    z TextMap.search.<locals>.<lambda>c                    s   g | ]}j | d qS ))r+   r,   r*   )r=   )r.   r)   r*   r,   r+   r%   r&   r'   r1   a   s    z"TextMap.search.<locals>.<listcomp>)

isinstancer	   
ValueErrorreescapeIcompilefinditerr#   filter)r%   r>   r?   r@   r+   r,   r*   compiledflagsgenfilteredr&   rD   r'   searchB   s&   


zTextMap.searchrB   c                 C   s    |rd}nd}| j |d|ddS )af  
        `strip` is analogous to Python's `str.strip()` method, and returns
        `text` attributes without their surrounding whitespace. Only
        relevant when the relevant TextMap is created with `layout` = True

        Setting `return_chars` to False will exclude the individual
        character objects from the returned text-line dicts.
        z *([^\n]+?) *(\n|$)z([^\n]+)r   F)r*   r,   r+   )rQ   )r%   rB   r,   patr&   r&   r'   extract_text_linesk   s   zTextMap.extract_text_lines)r   TT)TTTTr   )TT)__name__
__module____qualname____doc__r   r
   strr   r   r(   r   intrA   r   r   r=   r   r	   rQ   rS   r&   r&   r&   r'   r      sZ    "


*r   c                   @   s   e Zd ZdZdeeeef  ddfddZddddde	e
ddeddd	fd
edededededededededededededefddZdS )WordMapz&
    A WordMap maps words->chars.
    r   r   Nc                 C   s
   || _ d S r-   )r   r$   r&   r&   r'   r(      s   
zWordMap.__init__Fr   Tlayoutlayout_widthlayout_heightlayout_width_charslayout_height_chars	x_density	y_densityx_shifty_shifty_toleranceuse_text_flow	presortedexpand_ligaturesc           #      C   s  g }t | jst|S |rtni }|r;|r|rtdntt|| }|r-|r,tdntt|| }dg| }ng }d}|sC|rF| jnt| jdd d}|d d }|d |d	  }tt	|d
d |
|pg|dD ]\}}|r|d d d ||	  | nd}t
t|dkt|| }t|D ]}t |r|d d dkr||7 }|d q||7 }d}|s|r|nt|dd d}|D ]F\}}|r|d | | nd}t
td|t|| }|dg| 7 }||7 }|D ]}||d |d } | D ]}!||!|f |d7 }qqq|r|dg||  7 }qk|r@||d  }"t|"D ]}|dkr,||7 }|d q!|d dkr@|dd }t|S )a  
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by (doctop, x0) if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster.  Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.
        z;`layout_width` and `layout_width_chars` cannot both be set.z=`layout_height` and `layout_height_chars` cannot both be set.) Nr   c                 S      t | d d S Nr   doctopfloatxr&   r&   r'   rC          z$WordMap.to_textmap.<locals>.<lambda>keyrk   r3   c                 S   ri   rj   rl   rn   r&   r&   r'   rC      rp   )preserve_order
)ru   Nc                 S   ri   )Nr   r2   rl   rn   r&   r&   r'   rC     rp   r2   r   r/   N)lenr   r   	LIGATURESrF   rY   roundsorted	enumerater   maxrangeappendminget)#r%   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   _textmap
expansions
blank_linenum_newlineswords_sorted_doctop
first_worddoctop_startiwsy_distnum_newlines_prependline_lenline_words_sorted_x0wordr7   x_distnum_spaces_prependr0   lettersletternum_newlines_appendr&   r&   r'   
to_textmap   s   3





zWordMap.to_textmap)rT   rU   rV   rW   r   r
   r   r   r(   DEFAULT_X_DENSITYDEFAULT_Y_DENSITYDEFAULT_Y_TOLERANCErA   r   rY   r   r   r&   r&   r&   r'   rZ      sZ    	
rZ   c                   @   s  e Zd Zeedddddddf	dededededed	ed
eee	  de
ee	f defddZdedefddZdededefddZdedeeddf fddZdedeeddf fddZdedeeeef ddf fddZdedefddZdedefd d!ZdS )"WordExtractorFTNx_tolerancerd   keep_blank_charsre   horizontal_ltrvertical_ttbextra_attrssplit_at_punctuationrg   c
           
      C   sf   || _ || _|| _|| _|| _|| _|d u rg n|| _|du r"tjn|p%d| _	|	r.t
| _d S i | _d S )NTr    )r   rd   r   re   r   r   r   stringpunctuationr   rw   r   )
r%   r   rd   r   re   r   r   r   r   rg   r&   r&   r'   r(   %  s   zWordExtractor.__init__ordered_charsr   c              	      s   t |\}}}}|d d |d d  }|d d }|r  jr%n jr%dnd}d fdd	|D ||||| |||d
}	 jD ]
}
|d |
 |	|
< qA|	S )Nr   rk   r3   uprightr   rt   r    c                 3   s&    | ]} j |d  |d  V  qdS r/   N)r   r   r.   r0   r%   r&   r'   	<genexpr>J  s    
z,WordExtractor.merge_chars.<locals>.<genexpr>)r/   r2   r4   r3   rk   r5   r   	direction)r   r   r   r!   r   )r%   r   r2   r3   r4   r5   
doctop_adjr   r   r   rr   r&   r   r'   merge_charsB  s$   
zWordExtractor.merge_chars	prev_char	curr_charc           
      C   s   |d r2| j }| j}|d }|d }| jr"|d }|d }|d }	n=|d  }|d  }|d  }	n-| j}| j }|d }|d }| jrP|d }|d }|d }	n|d  }|d  }|d  }	t|	|k po|	|| kpo||| kS )a  This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr
        - Whether the user has specified that vertical text the text runs
          top-to-bottom (default) or bottom-to-top, as represented by
          self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        r   r3   r2   r4   r5   )r   rd   r   r   rA   )
r%   r   r   ro   yaycyaxbxcxr&   r&   r'   char_begins_new_word[  s:   '








z"WordExtractor.char_begins_new_wordc                 #   s    g  dt t dttd d f f fdd}|D ]@}|d }| js,| r,|d E d H  q|| jv r@||E d H  |d E d H  q rR|  d |rR||E d H  q | q r_ V  d S d S )Nnew_charr   c                 3   s&     r V  | d u rg  d S | g d S r-   r&   )r   current_wordr&   r'   start_next_word  s   z:WordExtractor.iter_chars_to_words.<locals>.start_next_wordr/   rt   )	r   r   r   r   r   isspacer   r   r}   )r%   r   r   charr/   r&   r   r'   iter_chars_to_words  s(   


z!WordExtractor.iter_chars_to_wordsr7   c           
      c   s    dt dtfdd}tt||dD ]@}|d d }|rdnd}t|t|| j}|D ]&}|r1dnd}t|t|d	}	|rA| jsLn| jsLt	|	E d H  q+|	E d H  q+qd S )
Nro   r   c                 S   s   t | d  S Nr   )rY   rn   r&   r&   r'   upright_key  s   z2WordExtractor.iter_sort_chars.<locals>.upright_keyr   r   rk   r2   rq   )
r   rY   r   listr   rd   ry   r   r   reversed)
r%   r7   r   upright_clusterr   cluster_keysubclustersscsort_keyto_yieldr&   r&   r'   iter_sort_chars  s    zWordExtractor.iter_sort_charsc                 c   sf    | j r|n| |}tdg| jR  }t||}|D ]\}}| |D ]
}| ||fV  q%qd S r   )re   r   r   r   	itertoolsgroupbyr   r   )r%   r7   r   grouping_keygrouped_charskeyvals
char_group
word_charsr&   r&   r'   iter_extract_tuples  s   z!WordExtractor.iter_extract_tuplesc                 C   s   t t| |S r-   )rZ   r   r   r%   r7   r&   r&   r'   extract_wordmap  s   zWordExtractor.extract_wordmapc                 C   s   t dd | |D S )Nc                 s   s    | ]\}}|V  qd S r-   r&   )r.   r   r   r&   r&   r'   r         z.WordExtractor.extract_words.<locals>.<genexpr>)r   r   r   r&   r&   r'   extract_words  s   zWordExtractor.extract_words)rT   rU   rV   DEFAULT_X_TOLERANCEr   r   rA   r   r   rX   r   r(   r   r   r   r   r   r   r   r   r
   r   rZ   r   r   r&   r&   r&   r'   r   $  sd    

	


K
#
r   r7   kwargsr   c                 K   s   t di || S )Nr&   )r   r   )r7   r   r&   r&   r'   r     s   r   c                    sV     ddi tdi  fddtD }|| }|jdi  fddtD }|S )Nrf   Tc                       i | ]}| v r| | qS r&   r&   r.   kr   r&   r'   
<dictcomp>       z$chars_to_textmap.<locals>.<dictcomp>c                    r   r&   r&   r   r   r&   r'   r     r   r&   )updater   WORD_EXTRACTOR_KWARGSr   r   TEXTMAP_KWARGS)r7   r   	extractorwordmaptextmapr&   r   r'   chars_to_textmap  s   
r   c                    s   t | } t| dkrdS  drt| fi  jS  dt}tdi  fddtD }|| }t	|t
d|}dd	d
 |D S )Nr   r    r[   rd   c                    r   r&   r&   r   r   r&   r'   r     r   z extract_text.<locals>.<dictcomp>rk   ru   c                 s   s$    | ]}d  dd |D V  qdS )rh   c                 s   s    | ]}|d  V  qdS r   r&   )r.   r   r&   r&   r'   r     r   z)extract_text.<locals>.<genexpr>.<genexpr>N)r!   )r.   liner&   r&   r'   r     s   " zextract_text.<locals>.<genexpr>r&   )r   rv   r   r   r#   r   r   r   r   r   r   r!   )r7   r   rd   r   wordslinesr&   r   r'   extract_text
  s   

r   
line_chars	tolerancec                 C   sV   d}d }t | tddD ]}|d ur|d || kr|d7 }|d }||d 7 }q|S )Nr    r2   rq   rh   r4   r/   )ry   r   )r   r   colllast_x1r   r&   r&   r'   collate_line  s   r   r   rd   c                    s(   t | td|}d fdd|D S )Nrk   ru   c                 3   s    | ]}t | V  qd S r-   )r   r   r   r&   r'   r   2  s    z&extract_text_simple.<locals>.<genexpr>)r   r   r!   )r7   r   rd   	clusteredr&   r   r'   extract_text_simple,  s   r   c                    sR   t dddd t dddtdttd	d	f f fd
d}|| }t|| jdS )u   
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.
    fontnamesizer   r/   rk   r2   r7   r   Nc                 3   sl    t |  d}tj| dD ]%\}}tt|tdD ]}t|tdD ]}t |dd V  q&qqd S )Nrq   rk   r2   r   )ry   r   r   r   r   r   )r7   sorted_charsgrp	grp_chars	y_cluster	x_clusterrr   pos_keyr   r&   r'   yield_unique_chars=  s   
z(dedupe_chars.<locals>.yield_unique_charsrq   )r   r   r   r   ry   index)r7   r   r   dedupedr&   r   r'   dedupe_chars5  s
   
$r   )r   )0inspectr   rG   r   operatorr   typingr   r   r   r   r   r   r	   r
   r   _typingr   r   r   r   
clusteringr   genericr   geometryr   r   r   r   r   rw   r   rZ   r   r   	signaturer   
parameterskeysr   r   r   rX   r   r   r   r   r&   r&   r&   r'   <module>   st    ,c % Q


	