Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
yiming-wange
GitHub Repository: yiming-wange/cs224n-2023-solution
Path: blob/main/a3/utils/__pycache__/parser_utils.cpython-310.pyc
1003 views
o

a��c�?�@s�dZddlZddlZddlZddlmZddlmZddlm	Z	ddl
m
Z
ddlZddlZ
dZd	Zd
ZdZdZGd
d�de�ZGdd�de�ZGdd�de�Zd"dd�Zd#dd�Zdd�Zdd�Zd$dd�ZGdd �d e�Zed!krv	dSdS)%z�
CS224N 2021-2022: Homework 3
parser_utils.py: Utilities for training the dependency parser.
Sahil Chopra <[email protected]>
�N)�Counter�)�get_minibatches)�minibatch_parse)�tqdmz<p>:z<l>:z<UNK>z<NULL>z<ROOT>c@sBeZdZdZdZdZdZdZdZeoeZdZ	dZ
dZdZdZ
dS)	�Config�englishTz./dataztrain.conllz	dev.conllz
test.conllz./data/en-cw.txtN)�__name__�
__module__�__qualname__�language�
with_punct�	unlabeled�	lowercase�use_pos�use_dep�	data_path�
train_file�dev_file�	test_file�embedding_file�rr�D/Users/yimingwang/Desktop/cs224n/assignment/a3/utils/parser_utils.pyrs
rc@sJeZdZdZdd�Zdd�Zdd�Zdd	�Zd
d�Zdd
�Z	ddd�Z
dS)�ParserzWContains everything needed for transition-based dependency parsing except for the modelcs>tdd�|D��}t|�}t|�dkrt�d�t�|�|��dd�_�jgtt�fdd�|D���}dd�t|�D�}t|�|t	t
<�_t�}|j
�_
|j�_|j�_|j�_|j�_�j
rmgd	�}d�_nd
d�|D�dd�|D�dg}t|��_t|��_d
d�t|�D��_dd�t|�D��_|�tdd�|D�t|�d��t|�|tt<�_t|�|tt
<�_t|�|tt<�_|�tdd�|D�t|�d��t|�|t<�_t|�|t
<�_
t|�|t<�_|�_dd�|��D��_ d|j�rdnd|j�rdnd�_!t|��_"dS)NcSs4g|]}t|d|d�D]
\}}|dkr
|�q
qS)�head�labelr)�zip)�.0�ex�h�lrrr�
<listcomp>.s��z#Parser.__init__.<locals>.<listcomp>rz!Warning: more than one root labelrcs(g|]}|dD]	}|�jkr|�qqS)r)�
root_label�rr�w��selfrrr!5s�
�cSsi|]	\}}t||�qSr)�L_PREFIX)r�ir rrr�
<dictcomp>8sz#Parser.__init__.<locals>.<dictcomp>)�L�R�ScS�g|]}d|�qS)zL-r�rr rrrr!F�cSr-)zR-rr.rrrr!Fr/r,cS�i|]\}}||�qSrr�rr(�trrrr)J�cSsi|]\}}||�qSrrr1rrrr)Kr3cSs"g|]
}|dD]}t|�qqS��pos)�P_PREFIXr#rrrr!Ns"��offsetcSsg|]}|dD]}|�qqS��wordrr#rrrr!U�cSr0rr)r�k�vrrrr)\r3��)#�listr�len�logging�info�most_commonr"�set�	enumerater'�NULL�L_NULLrrr
rrr�n_deprel�n_transZtran2idZid2tran�update�
build_dictr6�UNK�P_UNK�P_NULL�ROOT�P_ROOT�tok2id�items�id2tok�
n_features�n_tokens)r&�datasetZroot_labels�counterZdeprelrR�config�transrr%r�__init__-sN

""

��&zParser.__init__cs�g}|D]?}�jg�fdd�|dD�}�jg�fdd�|dD�}dg|d}dg�fdd�|d	D�}|�||||d
��q|S)Ncs&g|]}|�jvr�j|n�j�qSr)rRrM�rr$r%rrr!ds
��z$Parser.vectorize.<locals>.<listcomp>r:cs.g|]}t|�jvr�jt|n�j�qSr)r6rRrNr\r%rrr!fs
��r5�����rcs,g|]}t|�jvr�jt|nd�qS)r])r'rRr\r%rrr!is
��r�r:r5rr)rPrQ�append)r&�examplesZvec_examplesrr:r5rrrr%r�	vectorizeas ���
�zParser.vectorizecs�|ddkr
d|d<�fdd�}�fdd�}g}g}|jgdt|��fdd	�|d
d�D�}	|	�fdd	�|dd�D�|jgdt|�7}	|jr}|jgdt|��fdd	�|d
d�D�}|�fd
d	�|dd�D�|jgdt|�7}td�D�]�}
|
t|�k�r6||
d}||�}||�}
t|�dkr�||d�ng}t|
�dkr�||
d�ng}|	�t|�dkrƈd|dn|j�|	�t|
�dkrڈd|
dn|j�|	�t|�dkr�d|dn|j�|	�t|
�dk�r�d|
dn|j�|	�t|�dk�r�d|dn|j�|	�t|�dk�r-�d|dn|j�|j�r�|�t|�dk�rF�d|dn|j�|�t|
�dk�r[�d|
dn|j�|�t|�dk�rp�d|dn|j�|�t|
�dk�r��d|
dn|j�|�t|�dk�r��d|dn|j�|�t|�dk�r��d|dn|j�|j�r5|�t|�dk�rȈd|dn|j�|�t|
�dk�r݈d|
dn|j�|�t|�dk�r�d|dn|j�|�t|
�dk�r�d|
dn|j�|�t|�dk�r�d|dn|j�|�t|�dk�r1�d|dn|j�q�|	|jgd7}	|j�rJ||jgd7}|j�rV||jgd7}q�|	||7}	t|	�|jk�sgJ�|	S)NrrPcst�fdd��D��S)Ncs,g|]}|d�kr|d�kr|d�qS�rrr�rZarc�r<rrr!t�,z;Parser.extract_features.<locals>.get_lc.<locals>.<listcomp>��sortedrd��arcsrdr�get_lcssz'Parser.extract_features.<locals>.get_lccst�fdd��D�dd�S)Ncs,g|]}|d�kr|d�kr|d�qSrbrrcrdrrr!wrez;Parser.extract_features.<locals>.get_rc.<locals>.<listcomp>T)�reverserfrdrhrdr�get_rcvs�z'Parser.extract_features.<locals>.get_rc�c�g|]}�d|�qSr9r�r�x�rrrr!|�z+Parser.extract_features.<locals>.<listcomp>�����crnr9rrorqrrr!}rrcrnr4rrorqrrr!rrcrnr4rrorqrrr!�rr�rr:r5r�)	rGrArrO�ranger_rrHrU)r&�stack�bufrirrjrlZ
p_featuresZ
l_featuresZfeaturesr(r<�lc�rcZllcZrrcr)rirr�extract_featuresos`.2.2(((***************��zParser.extract_featuresc	s<t|�dkr|jdS|d�|d}�d�}�d|}�d�}�d|}|jrX|dkr8|�kr8dS|dkrN||krNt��fdd	�|D��sNdSt|�dkrVdSdS|dkrm|�krm|dkrk||jkrk|SdS|dkr�||kr�t��fd
d	�|D��s�|dkr�||jkr�||jSdSt|�dkr�dS|jdS)Nrtrr]�����rrrc� g|]}�d|�kr|�qS�rrro�r�i0rrr!�� z%Parser.get_oracle.<locals>.<listcomp>cr}r~rrorrrr!�r�)rArJr�anyrI)	r&rwrxr�i1�h0Zh1Zl0�l1rrr�
get_oracle�s.
�� zParser.get_oraclec
CsPg}d}t|�D]�\}}t|d�d}dg}dd�t|�D�}g}	g}
t|d�D]r}|�|||�}|dur9nl|�||�}
|
|dksGJ�|
�|�|||	|�|
|f�||jdkrj|�|d�|dd�}q*||jkr�|	�|d|d|f�|dd�|dg}q*|	�|d|d||jf�|dd�}q*|d7}||
7}q|S)	Nrr:rcS�g|]}|d�qS�rr)rr(rrrr!�r/z+Parser.create_instances.<locals>.<listcomp>rtr]r|)	rFrArvr��legal_labelsr_r{rJrI)r&r`Z
all_instances�succ�idr�n_wordsrwrxri�	instancesr(Zgold_tr�rrr�create_instances�s:�
�zParser.create_instancescCsft|�dkr	dgndg|j}|t|�dkrdgndg|j7}|t|�dkr-dg7}|Sdg7}|S)Nrtrr)rArI)r&rwrx�labelsrrrr��s"�zParser.legal_labels�c	Cs�g}i}t|�D] \}}t|d�d}dd�t|�D�}|�|�||t|�<qt|||�}	t||	|�}
d}}tt|�d��~}
t|�D]q\}}dgt|d�}|
|D]\}}|||<qVt|dd�|ddd�|d	dd�|d
dd��D]2\}}}}|j	|�
t�s�J�|j	|tt�d�}|js�t
|j|�s�|||kr�dnd7}|d7}q||
�|d�qEWd�n1s�wY||}||
fS)Nr:rcSr�r�r)r�jrrrr!�r/z Parser.parse.<locals>.<listcomp>g)�totalr]rrr5r)rFrArvr_r��ModelWrapperrrrrT�
startswithr6r
�punctrrK)r&rWZeval_batch_sizeZ	sentences�sentence_id_to_idxr(�exampler��sentence�model�dependencies�UASZ
all_tokens�progrrrr2Zpred_hZgold_hZgold_lr5Zpos_strrrr�parse�s8

8����
zParser.parseN)r�)r	r
r�__doc__r[rar{r�r�r�r�rrrrr*s4<"rc@seZdZdd�Zdd�ZdS)r�cCs||_||_||_dS�N)�parserrWr�)r&r�rWr�rrrr[s
zModelWrapper.__init__cs��fdd�|D�}t�|��d�}t�|���}�fdd�|D�}�j�|�}|���	�}t�
|dt�|��d�d�}dd�|D�}|S)	Ncs6g|]}�j�|j|j|j�j�jt|j���qSr)	r�r{rw�bufferr�rWr�r�r��r�pr%rrr!s
��z(ModelWrapper.predict.<locals>.<listcomp>�int32csg|]}�j�|j|j��qSr)r�r�rwr�r�r%rrr!r;i'�float32rcSs(g|]}|dkr
dn|dkrdnd�qS)rtr,rZLAZRArr�rrrr!s()�np�array�astype�torch�
from_numpy�longr�r��detach�numpy�argmax)r&Zpartial_parsesZmb_xZmb_l�predrr%r�predicts
� zModelWrapper.predictN)r	r
rr[r�rrrrr�
sr�Fc	CsDg}t|���}ggggf\}}}}|��D]b}	|	���d�}
t|
�dkrQd|
dvrP|�|r4|
d��n|
d�|�|
d�|�t|
d��|�|
d�qt|�dkrw|�||||d	��ggggf\}}}}|durwt|�|krwnqt|�dkr�|�||||d	��Wd�|SWd�|S1s�wY|S)
N�	�
�-rr�ru�r^)�open�	readlines�strip�splitrAr_�lower�int)�in_filerZmax_exampler`�fr:r5rr�line�sprrr�
read_conll"s6
��
��
��r�csPt�}|D]
}||d7<q|dur|��n|�|�}�fdd�t|�D�S)Nrcsi|]\}}|d|��qS�rr)r�indexr$r7rrr)?r;zbuild_dict.<locals>.<dictcomp>)rrDrF)�keysZn_maxr8�count�keyZlsrr7rrL8s�rLcCsl|dkr|dvS|dkr|dkS|dkr|dkS|dkr |dvS|d	kr(|d
vS|dkr0|dkStd
|��)Nr)z''�,�.�:z``z-LRB-z-RRB-�chineseZPU�frenchZPUNC�german)z$.z$,z$[�spanish)�f0ZfaaZfat�fc�fd�feZfg�fhZfia�fit�fpZfpaZfpt�fsZft�fxZfz�	universalZPUNCTzlanguage: %s is not supported.)�
ValueError)rr5rrrr�Bsr�cCsZt�dd�|D��}t�dd�|D��}t�|jdf�}d|t�|j�|f<t||g|�S)NcS�g|]}|d�qSr�r�r�drrrr!Wr/zminibatches.<locals>.<listcomp>cSr�)rtrr�rrrr!Xr/rmr)r�r��zeros�size�aranger)�data�
batch_sizerp�y�one_hotrrr�minibatchesVs
r�TcCs(t�}td�t��}ttj�|j|j�|j	d�}ttj�|j|j
�|j	d�}ttj�|j|j�|j	d�}|rI|dd�}|dd�}|dd�}td�t��|��td�t��}t
|�}td�t��|��td�t��}i}t|j���D]}|����}	dd	�|	d
d�D�||	d<q|tjtj�dd|jd
f�dd�}
|jD] }|j|}||vr�|||
|<q�|��|vr�||��|
|<q�td�t��|��td�t��}|�|�}|�|�}|�|�}td�t��|��td�t��}|�|�}
td�t��|��||
|
||fS)NzLoading data...)ri�i�ztook {:.2f} secondszBuilding parser...z Loading pretrained embeddings...cSsg|]}t|��qSr)�floatrorrrr!yr/z,load_and_preprocess_data.<locals>.<listcomp>rrg�������?�2r�)�dtypezVectorizing data...zPreprocessing training data...)r�print�timer��os�path�joinrrrrr�formatrr�rr�r�r�r��asarray�random�normalrVrRr�rar�)�reducedrY�startZ	train_setZdev_setZtest_setr�Zword_vectorsr�r�Zembeddings_matrix�tokenr(Ztrain_examplesrrr�load_and_preprocess_data^s^���  

�



r�c@s*eZdZdZdd�Zdd�Zd
dd�Zd	S)�AverageMeterz1Computes and stores the average and current valuecCs|��dSr�)�resetr%rrrr[�szAverageMeter.__init__cCsd|_d|_d|_d|_dS�Nr)�val�avg�sumr�r%rrrr��s
zAverageMeter.resetrcCs8||_|j||7_|j|7_|j|j|_dSr�)r�r�r�r�)r&r��nrrrrK�szAverageMeter.updateNr�)r	r
rr�r[r�rKrrrrr��s
r��__main__)FNr�)T)r�r�r�rB�collectionsrZ
general_utilsrZparser_transitionsrrr�r�r�r6r'rMrGrP�objectrrr�r�rLr�r�r�r�r	rrrr�<module>s8d



5�