3 ig,@sddlmZmZmZddlmZddlmZm Z ddl m Z ddl m Z ddl mZddl mZmZdd l mZmZmZdd l mZmZdd l mZdd lmZdd lmZeeZe dkreZne ZGdddeZdS))absolute_importdivisionunicode_literals)unichr)deque OrderedDict) version_info)spaceCharacters)entities) asciiLettersasciiUpper2Lower)digits hexDigitsEOF) tokenTypes tagTokenTypes)replacementCharacters)HTMLInputStream)TriecsdeZdZdZdfdd ZddZddZdd d Zd d ZddZ ddZ ddZ ddZ ddZ ddZddZddZddZd d!Zd"d#Zd$d%Zd&d'Zd(d)Zd*d+Zd,d-Zd.d/Zd0d1Zd2d3Zd4d5Zd6d7Zd8d9Zd:d;Zdd?Z!d@dAZ"dBdCZ#dDdEZ$dFdGZ%dHdIZ&dJdKZ'dLdMZ(dNdOZ)dPdQZ*dRdSZ+dTdUZ,dVdWZ-dXdYZ.dZd[Z/d\d]Z0d^d_Z1d`daZ2dbdcZ3dddeZ4dfdgZ5dhdiZ6djdkZ7dldmZ8dndoZ9dpdqZ:drdsZ;dtduZdzd{Z?d|d}Z@d~dZAddZBddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLZMS) HTMLTokenizera  This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. Nc sFt|f||_||_d|_g|_|j|_d|_d|_t t |j dS)NF) rstreamparser escapeFlag lastFourChars dataStatestateescape currentTokensuperr__init__)selfrrkwargs) __class__>/tmp/pip-build-88gy_88q/pip/pip/_vendor/html5lib/_tokenizer.pyr"(szHTMLTokenizer.__init__ccs\tg|_xL|jrVx&|jjr:td|jjjddVqWx|jrR|jjVq>Wq WdS)z This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. ParseErrorr)typedataN)r tokenQueuerrerrorsrpoppopleft)r#r&r&r'__iter__7s    zHTMLTokenizer.__iter__c %Cs(t}d}|rt}d}g}|jj}x(||krJ|tk rJ|j||jj}q$Wtdj||}|tkrt|}|j jt ddd|idnld|kod kns|d krd }|j jt ddd|idn(d |kod knsd|kodknsd|kodknsd|ko4dkns|t ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d g#kr|j jt ddd|idy t |}Wn>t k r|d6}t d|d?Bt d7|d8@B}YnX|d9kr$|j jt dd:d;|jj||S)r(z'expected-tag-name-but-got-right-bracket)r)r*rRz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerLT)rr<markupDeclarationOpenStatercloseTagOpenStater rr tagNameStater+r=rrCbogusCommentState)r#r*r&r&r'rnws6               zHTMLTokenizer.tagOpenStatecCs|jj}|tkr0td|gdd|_|j|_n|dkrX|jjtddd|j |_nn|t kr|jjtddd|jjtd d d|j |_n0|jjtdd d |id |jj ||j |_dS)NrdF)r)rbr*rer|r(z*expected-closing-tag-but-got-right-bracket)r)r*z expected-closing-tag-but-got-eofrRz|tkr|jjtdd d|j |_n|jjtd|dd S) NrrR)r)r*rLrlr(zinvalid-codepointu�zeof-in-script-in-scriptT) rr<r+r=r scriptDataDoubleEscapedDashStater(scriptDataDoubleEscapedLessThanSignStaterr)r#r*r&r&r'rs$          z*HTMLTokenizer.scriptDataDoubleEscapedStatecCs|jj}|dkr2|jjtddd|j|_n|dkrZ|jjtddd|j|_n|dkr|jjtddd|jjtddd|j|_nF|t kr|jjtdd d|j |_n|jjtd|d|j|_d S) NrrR)r)r*rLrlr(zinvalid-codepointu�zeof-in-script-in-scriptT) rr<r+r=r$scriptDataDoubleEscapedDashDashStaterrrrr)r#r*r&r&r'rs(           z.HTMLTokenizer.scriptDataDoubleEscapedDashStatecCs|jj}|dkr*|jjtdddn|dkrR|jjtddd|j|_n|dkrz|jjtddd|j|_n|dkr|jjtddd|jjtdd d|j|_nF|t kr|jjtdd d|j |_n|jjtd|d|j|_d S) NrrR)r)r*rLr|rlr(zinvalid-codepointu�zeof-in-script-in-scriptT) rr<r+r=rrrrwrrr)r#r*r&r&r'r%s,           z2HTMLTokenizer.scriptDataDoubleEscapedDashDashStatecCsP|jj}|dkr8|jjtdddd|_|j|_n|jj||j |_dS)NrzrR)r)r*r2T) rr<r+r=rrscriptDataDoubleEscapeEndStaterrCr)r#r*r&r&r'r>s   z6HTMLTokenizer.scriptDataDoubleEscapedLessThanSignStatecCs|jj}|ttdBkrR|jjtd|d|jjdkrH|j |_ q|j |_ nB|t kr|jjtd|d|j|7_n|jj ||j |_ dS)Nrzr|rR)r)r*rT)rzr|)rr<r r@r+r=rrrrrrr rC)r#r*r&r&r'rIs    z,HTMLTokenizer.scriptDataDoubleEscapeEndStatecCs0|jj}|tkr$|jjtdn|tkrJ|jdj|dg|j|_n|dkr\|j n|dkrn|j |_n|dkr|j jt d d d |jdj|dg|j|_n|d kr|j jt d dd |jdjddg|j|_nF|t kr|j jt d dd |j|_n|jdj|dg|j|_dS)NTr*r2r|rz'"rPrLr(z#invalid-character-in-attribute-name)r)r*rlzinvalid-codepointu�z#expected-attribute-name-but-got-eof)rrrPrL)rr<r ror r r=attributeNameStaterrkrr+rrr)r#r*r&r&r'rYs6              z&HTMLTokenizer.beforeAttributeNameStatecCs|jj}d}d}|dkr&|j|_n0|tkr^|jddd||jjtd7<d}n|dkrld}n|tkr~|j|_n|dkr|j |_n|d kr|j j t d d d |jdddd 7<d}n|dkr|j j t d dd |jddd|7<d}nH|t kr8|j j t d dd |j|_n|jddd|7<d}|r|jdddjt|jddd<xP|jdddD]:\}}|jddd|kr|j j t d dd PqW|r|jdS)NTFrPr*r rr|rzrlr(zinvalid-codepoint)r)r*u�rrrLz#invalid-character-in-attribute-namezeof-in-attribute-namezduplicate-attributerSrS)rrrLrSrSrSrSrSrS)rr<beforeAttributeValueStaterr r ror afterAttributeNameStaterr+r=rrrrfr rk)r#r*leavingThisState emitTokenrb_r&r&r'rwsR             &  z HTMLTokenizer.attributeNameStatecCsF|jj}|tkr$|jjtdn|dkr8|j|_n |dkrJ|jn|tkrp|jdj |dg|j |_n|dkr|j |_n|dkr|j j t dd d |jdj d dg|j |_n|dkr|j j t ddd |jdj |dg|j |_nF|tkr&|j j t ddd |j|_n|jdj |dg|j |_dS)NTrPr|r*r2rzrlr(zinvalid-codepoint)r)r*u�rrrLz&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)rrrL)rr<r rorrrkr r r=rrr+rrr)r#r*r&r&r'rs:                z%HTMLTokenizer.afterAttributeNameStatecCsj|jj}|tkr$|jjtdnB|dkr8|j|_n.|dkrX|j|_|jj|n|dkrl|j|_n|dkr|j j t ddd|j n|d kr|j j t dd d|j d dd d 7<|j|_n|dkr|j j t ddd|j d dd |7<|j|_nL|tkrD|j j t ddd|j|_n"|j d dd |7<|j|_dS)NTrrKrr|r(z.expected-attribute-value-but-got-right-bracket)r)r*rlzinvalid-codepointr*r u�rPrL`z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eofrS)rPrLrrSrS)rr<r roattributeValueDoubleQuotedStaterattributeValueUnQuotedStaterCattributeValueSingleQuotedStater+r=rrkr rr)r#r*r&r&r'rs>                 z'HTMLTokenizer.beforeAttributeValueStatecCs|jj}|dkr|j|_n|dkr0|jdn|dkrj|jjtddd|jdd dd 7<nN|t kr|jjtdd d|j |_n&|jdd d||jj d7<d S)NrrKrlr(zinvalid-codepoint)r)r*r*r u�z#eof-in-attribute-value-double-quoteTrSrS)rrKrl) rr<afterAttributeValueStaterrar+r=rr rrro)r#r*r&r&r'rs         z-HTMLTokenizer.attributeValueDoubleQuotedStatecCs|jj}|dkr|j|_n|dkr0|jdn|dkrj|jjtddd|jdd dd 7<nN|t kr|jjtdd d|j |_n&|jdd d||jj d7<d S)NrrKrlr(zinvalid-codepoint)r)r*r*r u�z#eof-in-attribute-value-single-quoteTrSrS)rrKrl) rr<rrrar+r=rr rrro)r#r*r&r&r'rs         z-HTMLTokenizer.attributeValueSingleQuotedStatecCs|jj}|tkr|j|_n|dkr2|jdn|dkrD|jn|dkr~|jjt dd d |j d dd |7<n|d kr|jjt ddd |j d dd d7<nV|t kr|jjt ddd |j |_n.|j d dd ||jj tdtB7<dS)NrKr|rrrPrLrr(z0unexpected-character-in-unquoted-attribute-value)r)r*r*r rlzinvalid-codepointu�z eof-in-attribute-value-no-quotesT)rrrPrLrrSrSrS)rKr|rrrPrLrrl)rr<r rrrarkr+r=rr rrror@)r#r*r&r&r'rs,           z)HTMLTokenizer.attributeValueUnQuotedStatecCs|jj}|tkr|j|_n|dkr.|jnp|dkr@|j|_n^|tkrt|jj t ddd|jj ||j |_n*|jj t ddd|jj ||j|_dS)Nr|rzr(z$unexpected-EOF-after-attribute-value)r)r*z*unexpected-character-after-attribute-valueT) rr<r rrrkrrr+r=rrCr)r#r*r&r&r'r.s"           z&HTMLTokenizer.afterAttributeValueStatecCs|jj}|dkr&d|jd<|jn^|tkrZ|jjtddd|jj||j |_ n*|jjtddd|jj||j |_ dS)Nr|Trer(z#unexpected-EOF-after-solidus-in-tag)r)r*z)unexpected-character-after-solidus-in-tag) rr<r rkrr+r=rrCrrr)r#r*r&r&r'rBs          z&HTMLTokenizer.selfClosingStartTagStatecCsD|jjd}|jdd}|jjtd|d|jj|j|_dS)Nr|rlu�Comment)r)r*T) rroreplacer+r=rr<rr)r#r*r&r&r'rTs   zHTMLTokenizer.bogusCommentStatecCs|jjg}|ddkrT|j|jj|ddkrPtddd|_|j|_dSn|ddkrd}x.d&D]&}|j|jj|d'|krjd}PqjW|rtdddddd|_|j|_dSn|d(dkrH|jdk rH|jj j rH|jj j d)j |jj j krHd}x2d*D]*}|j|jj|d+|krd}PqW|rH|j |_dS|jjtdddx|rz|jj|jq`W|j|_dS),Nr rrr2)r)r*TdDoOrHCtTyYpPeEFDoctype)r)rbpublicIdsystemIdcorrect[Ar(zexpected-dashes-or-doctyperSrSrS)rrrrrHrrrrrrrrr)rrrrrrrSrSrS)rrrrrrrS)rr<r=rr commentStartStater doctypeStatertree openElements namespacedefaultNamespacecdataSectionStater+rCr-r)r#rGmatchedexpectedr&r&r'r~csR           z(HTMLTokenizer.markupDeclarationOpenStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd d|jj|j|j|_nP|t kr|jjtdd d|jj|j|j|_n|jd|7<|j |_d S) Nrrlr(zinvalid-codepoint)r)r*r*u�r|zincorrect-commentzeof-in-commentT) rr<commentStartDashStaterr+r=rr rr commentState)r#r*r&r&r'rs(          zHTMLTokenizer.commentStartStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd d|jj|j|j|_nT|t kr|jjtdd d|jj|j|j|_n|jdd|7<|j |_d S) Nrrlr(zinvalid-codepoint)r)r*r*u-�r|zincorrect-commentzeof-in-commentT) rr<commentEndStaterr+r=rr rrr)r#r*r&r&r'rs(          z#HTMLTokenizer.commentStartDashStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<nT|tkr|jjtddd|jj|j|j |_n|jd||jj d 7<d S) Nrrlr(zinvalid-codepoint)r)r*r*u�zeof-in-commentT)rrl) rr<commentEndDashStaterr+r=rr rrro)r#r*r&r&r'rs        zHTMLTokenizer.commentStatecCs|jj}|dkr|j|_n|dkrV|jjtddd|jdd7<|j|_nT|t kr|jjtddd|jj|j|j |_n|jdd|7<|j|_d S) Nrrlr(zinvalid-codepoint)r)r*r*u-�zeof-in-comment-end-dashT) rr<rrr+r=rr rrr)r#r*r&r&r'rs         z!HTMLTokenizer.commentEndDashStatecCs,|jj}|dkr*|jj|j|j|_n|dkrd|jjtddd|jdd7<|j|_n|dkr|jjtdd d|j |_n|d kr|jjtdd d|jd|7<nj|t kr|jjtdd d|jj|j|j|_n4|jjtdd d|jdd|7<|j|_dS)Nr|rlr(zinvalid-codepoint)r)r*r*u--�ryz,unexpected-bang-after-double-dash-in-commentrz,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T) rr<r+r=r rrrrcommentEndBangStater)r#r*r&r&r'rs6               zHTMLTokenizer.commentEndStatecCs|jj}|dkr*|jj|j|j|_n|dkrN|jdd7<|j|_n|dkr|jjtddd|jdd 7<|j |_nT|t kr|jjtdd d|jj|j|j|_n|jdd|7<|j |_d S) Nr|rr*z--!rlr(zinvalid-codepoint)r)r*u--!�zeof-in-comment-end-bang-stateT) rr<r+r=r rrrrrr)r#r*r&r&r'rs(         z!HTMLTokenizer.commentEndBangStatecCs|jj}|tkr|j|_nj|tkr\|jjtdddd|j d<|jj|j |j |_n*|jjtddd|jj ||j|_dS)Nr(z!expected-doctype-name-but-got-eof)r)r*Frzneed-space-after-doctypeT) rr<r beforeDoctypeNameStaterrr+r=rr rrC)r#r*r&r&r'rs         zHTMLTokenizer.doctypeStatecCs|jj}|tkrn|dkrT|jjtdddd|jd<|jj|j|j|_n|dkr|jjtdddd |jd <|j |_nR|t kr|jjtdd dd|jd<|jj|j|j|_n||jd <|j |_d S) Nr|r(z+expected-doctype-name-but-got-right-bracket)r)r*Frrlzinvalid-codepointu�rbz!expected-doctype-name-but-got-eofT) rr<r r+r=rr rrdoctypeNameStater)r#r*r&r&r'r*s.              z$HTMLTokenizer.beforeDoctypeNameStatecCs|jj}|tkr2|jdjt|jd<|j|_n|dkrh|jdjt|jd<|jj |j|j |_n|dkr|jj t ddd|jdd7<|j |_nh|t kr|jj t dddd |jd <|jdjt|jd<|jj |j|j |_n|jd|7<d S) Nrbr|rlr(zinvalid-codepoint)r)r*u�zeof-in-doctype-nameFrT)rr<r r rfr afterDoctypeNameStaterr+r=rrrr)r#r*r&r&r'rDs,          zHTMLTokenizer.doctypeNameStatecCsR|jj}|tkrn8|dkr8|jj|j|j|_n|tkrd|jd<|jj ||jjt ddd|jj|j|j|_n|d!krd }x$d'D]}|jj}||krd}PqW|r|j |_d SnJ|d(krd }x(d.D] }|jj}||krd}PqW|r|j |_d S|jj ||jjt ddd|id d|jd<|j |_d S)/Nr|Frr(zeof-in-doctype)r)r*rrTuUbBlLiIrHrsSrrrrrrmMz*expected-space-or-right-bracket-in-doctyper*)r)r*r4)rrrrrrrrrrrHr)rrrrr)rrrrrrrrrrrr)rrrrr)rr<r r+r=r rrrrCrafterDoctypePublicKeywordStateafterDoctypeSystemKeywordStatebogusDoctypeState)r#r*rrr&r&r'r]sT              z#HTMLTokenizer.afterDoctypeNameStatecCs|jj}|tkr|j|_n|d krP|jjtddd|jj||j|_nT|t kr|jjtdddd|j d<|jj|j |j |_n|jj||j|_d S) Nrrr(zunexpected-char-in-doctype)r)r*zeof-in-doctypeFrT)rr) rr<r "beforeDoctypePublicIdentifierStaterr+r=rrCrr r)r#r*r&r&r'rs"           z,HTMLTokenizer.afterDoctypePublicKeywordStatecCs|jj}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jjt dddd |jd <|jj|j|j |_nh|t kr|jjt dd dd |jd <|jj|j|j |_n(|jjt dd dd |jd <|j |_d S)Nrr2rrr|r(zunexpected-end-of-doctype)r)r*Frzeof-in-doctypezunexpected-char-in-doctypeT) rr<r r (doctypePublicIdentifierDoubleQuotedStater(doctypePublicIdentifierSingleQuotedStater+r=rrrr)r#r*r&r&r'rs4                z0HTMLTokenizer.beforeDoctypePublicIdentifierStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrrlr(zinvalid-codepoint)r)r*ru�r|zunexpected-end-of-doctypeFrzeof-in-doctypeT) rr<!afterDoctypePublicIdentifierStaterr+r=rr rr)r#r*r&r&r'rs*            z6HTMLTokenizer.doctypePublicIdentifierDoubleQuotedStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrrlr(zinvalid-codepoint)r)r*ru�r|zunexpected-end-of-doctypeFrzeof-in-doctypeT) rr<rrr+r=rr rr)r#r*r&r&r'rs*            z6HTMLTokenizer.doctypePublicIdentifierSingleQuotedStatecCs |jj}|tkr|j|_n|dkr<|jj|j|j|_n|dkrn|jjt dddd|jd<|j |_n|dkr|jjt dddd|jd<|j |_nh|t kr|jjt dd dd |jd <|jj|j|j|_n(|jjt dddd |jd <|j |_d S) Nr|rr(zunexpected-char-in-doctype)r)r*r2rrzeof-in-doctypeFrT)rr<r -betweenDoctypePublicAndSystemIdentifiersStaterr+r=r rr(doctypeSystemIdentifierDoubleQuotedState(doctypeSystemIdentifierSingleQuotedStaterr)r#r*r&r&r'rs6                  z/HTMLTokenizer.afterDoctypePublicIdentifierStatecCs|jj}|tkrn|dkr4|jj|j|j|_n|dkrPd|jd<|j|_n|dkrld|jd<|j |_nh|t kr|jjt dddd |jd <|jj|j|j|_n(|jjt dd dd |jd <|j |_d S) Nr|rr2rrr(zeof-in-doctype)r)r*Frzunexpected-char-in-doctypeT) rr<r r+r=r rrrrrrr)r#r*r&r&r'rs.             z;HTMLTokenizer.betweenDoctypePublicAndSystemIdentifiersStatecCs|jj}|tkr|j|_n|d krP|jjtddd|jj||j|_nT|t kr|jjtdddd|j d<|jj|j |j |_n|jj||j|_d S) Nrrr(zunexpected-char-in-doctype)r)r*zeof-in-doctypeFrT)rr) rr<r "beforeDoctypeSystemIdentifierStaterr+r=rrCrr r)r#r*r&r&r'r)s"           z,HTMLTokenizer.afterDoctypeSystemKeywordStatecCs|jj}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jjt dddd |jd <|jj|j|j |_nh|t kr|jjt dd dd |jd <|jj|j|j |_n(|jjt dddd |jd <|j |_d S) Nrr2rrr|r(zunexpected-char-in-doctype)r)r*Frzeof-in-doctypeT) rr<r r rrrr+r=rrrr)r#r*r&r&r'r=s4                z0HTMLTokenizer.beforeDoctypeSystemIdentifierStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrrlr(zinvalid-codepoint)r)r*ru�r|zunexpected-end-of-doctypeFrzeof-in-doctypeT) rr<!afterDoctypeSystemIdentifierStaterr+r=rr rr)r#r*r&r&r'rZs*            z6HTMLTokenizer.doctypeSystemIdentifierDoubleQuotedStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrrlr(zinvalid-codepoint)r)r*ru�r|zunexpected-end-of-doctypeFrzeof-in-doctypeT) rr<rrr+r=rr rr)r#r*r&r&r'rrs*            z6HTMLTokenizer.doctypeSystemIdentifierSingleQuotedStatecCs|jj}|tkrn~|dkr4|jj|j|j|_n^|tkrt|jjt dddd|jd<|jj|j|j|_n|jjt ddd|j |_dS) Nr|r(zeof-in-doctype)r)r*Frzunexpected-char-in-doctypeT) rr<r r+r=r rrrrr)r#r*r&r&r'rs         z/HTMLTokenizer.afterDoctypeSystemIdentifierStatecCsZ|jj}|dkr*|jj|j|j|_n,|tkrV|jj||jj|j|j|_ndS)Nr|T) rr<r+r=r rrrrC)r#r*r&r&r'rs    zHTMLTokenizer.bogusDoctypeStatecCsg}x|j|jjd|j|jjd|jj}|tkr@Pq|dksLt|ddddkrx|ddd|d<Pq|j|qWdj|}|jd}|dkrx&t|D]}|j jt d d d qW|j dd }|r|j jt d |d |j |_ dS)N]r|r z]]r2rlrr(zinvalid-codepoint)r)r*u�rRTrSrSrrS)r=rror<rAssertionErrorr?countranger+rrrr)r#r*r< nullCountrr&r&r'rs0        zHTMLTokenizer.cdataSectionState)N)NF)N__name__ __module__ __qualname____doc__r"r/rJr`rarkrrmrsrqrurwrxrnrrrrrrrtrrrvrrrrrrrrrrrrrrrrrrrrrrrrrrr~rrrrrrrrrrrrrrrrrrrrrrr __classcell__r&r&)r%r'rs H P#         6 "-3rN)rr) __future__rrrpip._vendor.sixrrA collectionsrrsysr constantsr r r r rrrrrr _inputstreamr_trierrTdictrgobjectrr&r&r&r's