#include "ehtml.h"
#include "logger.h"
#include "efile.h"
#include "eregexp.h"
// Tells whether `c` belongs to the RFC 3986 "unreserved" set, i.e. it is an
// ASCII letter, an ASCII digit, or one of '-', '_', '.', '~'.
inline bool isURLUnreserved(char c)
{
  // The four punctuation members of the set.
  switch (c){
    case '-':
    case '_':
    case '.':
    case '~':
      return(true);
    default:
      break;
  }
  // Explicit ASCII ranges (no locale-dependent classification).
  if ('a'<=c && c<='z') return(true);
  if ('A'<=c && c<='Z') return(true);
  return('0'<=c && c<='9');
}
// Formats the byte `c` as lowercase hexadecimal text using printf's "%hhx".
// NOTE(review): "%hhx" does not zero-pad, so values below 0x10 produce a
// single digit -- confirm that callers (e.g. percent-encoding) expect that.
inline estr chartohex(char c)
{
  estr hex;
  hex.reserve(256);  // buffer for sprintf to write into
  // sprintf returns the number of characters stored (terminator excluded),
  // which is the length of the text just written.
  hex._strlen=sprintf(hex._str,"%hhx",c);
  return(hex);
}
// H2C(c): ORs the value (0-15) of the hexadecimal digit `c` into a variable
// named `res`, which must be in scope where the macro is expanded.
// Characters that are not hex digits leave `res` unchanged.
// It expands to an if / else-if chain, so use it as a statement: `H2C(x);`.
#define H2C(c) if ((c) >= '0' && (c) <= '9') \
  res=res|((c)-'0'); \
else if ((c) >= 'a' && (c) <= 'f') \
  res=res|((c)-'a'+10u); \
else if ((c) >= 'A' && (c) <= 'F') \
  res=res|((c)-'A'+10u);

// Decodes two hexadecimal digits into one byte: `c` supplies the high nibble
// and `c2` the low nibble, e.g. hextochar('4','1') == 'A'.
// Upper- and lowercase digits are accepted; a character that is not a hex
// digit contributes 0 to its nibble.
inline char hextochar(char c,char c2)
{
  char res=0x00;
  H2C(c);       // high nibble
  res=res<<4;
  H2C(c2);      // low nibble
  return(res);
}
// 64-entry mask table: entry 0 is zero and every initializer after it is
// all-ones (0xffffffffffffffff).
// In the visible code it is only referenced from the commented-out block in
// base64encode, where it is indexed with (i%64u).
// NOTE(review): the literals assume `unsigned long` is 64 bits wide -- confirm
// for targets where it is 32 bits.
const unsigned long safe_shift64[64]={0x0ul,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful,0xfffffffffffffffful};
// Base64 symbol table: indices 0-63 are the alphabet A-Z, a-z, 0-9, '+', '/';
// index 64 holds '=' (presumably the padding symbol -- the code that would
// show its use is not fully visible here).
estr base64chars("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=");
estr base64encode(const estr& str)
{
estr outstr;
if (str.len()==0)
return(outstr);
unsigned char *pstr=reinterpret_cast(str._str);
uint64_t tmp=0x0ul;
outstr.reserve(((str.len()+2)/3)*4);
outstr._strlen=((str.len()+2)/3)*4;
// outstr.reserve((str.len()*8+9)/10);
// outstr._strlen=(str.len()*8+9)/10;
int i=0,ji,jo=0;
/*
for (; i+64u>(i%64u);
tmp|=(p[i/64u+1u]<<(64u-(i%64u)))&safe_shift64[i%64u];
for (ji=0; ji<64u; ji+=6u,tmp>>=6u)
outstr[jo++]=base64chars[tmp&0x3F];
}
*/
for (; i+8u>10u)&0x3F];
// cout << int((tmp>>10u)&0x3F) << " " << base64chars[(tmp>>10u)&0x3F] << endl;
}
tmp=uint64_t(pstr[i/8u])<<(8u+i%8u);
outstr[jo++]=base64chars[(tmp>>10u)&0x3F];
// cout << int((tmp>>10u)&0x3F) << " " << base64chars[(tmp>>10u)&0x3F] << endl;
i+=6u; tmp<<=6u;
if (i>10u)&0x3F];
// cout << int((tmp>>10u)&0x3F) << " " << base64chars[(tmp>>10u)&0x3F] << endl;
}
while (jo=>,&=&");
estr html_entities(const estr& str)
{
estr tmpstr(str);
int i;
for (i=0; i0){
tmpstr+=str.substr(i,b-i);
ihe=htmlchars.find(str.substr(b,e-b));
if (ihe==-1) { lwarn("unknown html entity char: "+str.substr(b,e-b)); tmpstr+=str.substr(b,e-b); }
else tmpstr+=htmlchars.keys(ihe);
i=e;
}
tmpstr+=str.substr(i);
return(tmpstr);
}
inline void skip_until2(const estr& str,const char *c,int &i){
const char *p;
for(;i /",i);
tag.type=str.substr(s,i-s).lowercase();
if (i>=str.len()){ lerror("unexpected EOF: tag missing closing >"); return; }
if (tag.type.substr(0,3)=="!--"){ tag.type="!--"; i=s+3; return; }
if (tag.type.substr(0,8)=="!doctype"){ tag.type="!doctype"; i=s+8; return; }
while (i=str.len()) { lerror("unexpected EOF: tag missing closing >"); return; }
s=i;
skip_until2(str," =>/",i);
argname=str.substr(s,i-s).lowercase();
// if (tag.type=="body") cout << "argname: "<=str.len()){ lerror("unexpected EOF: tag missing closing >"); return; }
if (str._str[i]=='>') break;
if (str._str[i]=='/') {
++i;
tag.single=true;
skip_blank(str,i);
if (str._str[i]!='>') lerror("Missing closing > in tag: "+tag.type);
break;
}
if (str._str[i]=='>') { tag.args.add(argname,""); lwarn("tagarg missing =value part"); break; }
if (str._str[i]!='=') { tag.args.add(argname,""); lwarn("tagarg missing =value part"); continue; }
++i;
skip_blank(str,i);
if (i>=str.len()) { tag.args.add(argname,""); lerror("unexpected EOF: tag missing closing >"); break; }
if (str._str[i]=='>') { tag.args.add(argname,""); lwarn("tagarg missing =value part"); break; }
s=i;
if (str._str[i]=='"' || str._str[i]=='\''){
char delim=str._str[i];
++s; ++i;
for (;i",i);
argvalue=str.substr(s,i-s);
}
// if (tag.type=="body") cout << "argval: "<0 && tag.type[0]!='/' && str.ifind(""+tag.type+">",i)==-1){
lwarn("tag: "+tag.type+" missing closing /tag, assuming single tag");
tag.single=true;
}
}
/*
ehtmltag::~ehtmltag()
{
int i;
for (i=0; itext=text;
if (i==-1)
parts.addref(ntag);
else
parts.insertref(i,ntag);
}
void ehtmltag::addTag(const ehtmltag& tag)
{
ehtmltag *ntag=new ehtmltag;
ntag->single=tag.single;
ntag->text=tag.text;
ntag->type=tag.type;
ntag->args=tag.args;
int i;
for (i=0; iaddTag(tag.parts[i]);
parts.addref(ntag);
}
// Tag names whose content ehtmltag::parse still descends into even when the
// tag was flagged `single`: its loop calls tag->parse(...) when
// `!tag->single || containingTypes.find(tag->type)!=-1`.
// Presumably estrarray splits this literal on commas -- confirm against estrarray.
estrarray containingTypes("html,body,div,center,p,span,h1,h2,h3,h4,h5,h6,table,tr,td,ul,ol,li,b,u,i,em,iframe,blockquote,form,font,small,strong");
void ehtmltag::parse(ehtml& html,const estr& data,int &i)
{
ehtmltag *tag;
int s,ts;
ldinfo(" >> "+type);
if (type=="script" || type=="style"){
s=i;
i=data.ifind(""+type+">",i);
if (i==-1) { lerror("end "+type+"> not found in document"); return; }
addText(data.substr(s,i-s));
tag=new ehtmltag;
tag_get(data,i,*tag);
delete tag;
++i; // skip the > character
return;
}
if (type=="!--"){
s=i;
i=data.find("-->",i);
if (i==-1) { lerror("closing --> not found in document"); return; }
addText(data.substr(s,i-s));
i+=3; // skip the --> closing tag
return;
}
if (type=="!doctype"){
s=i;
i=data.find(">",i);
if (i==-1) { lerror("closing > for !doctype tag not found in document"); return; }
addText(data.substr(s,i-s));
i+=1; // skip the > closing tag
cout << data.substr(s,i-s) << endl;
return;
}
if (type.len()>0 && type[0]=='/'){ lerror("problem in parsing, found unhandled closing tag: "+type); }
while (itype=="a")
html.links.add(tag);
else if (tag->type=="form")
html.forms.add(tag);
else if (tag->type=="script")
html.scripts.add(tag);
else if (tag->type=="link")
html.css.add(tag);
else if (tag->type=="img")
html.images.add(tag);
else if (tag->type=="head")
html.head=tag;
else if (tag->type=="body")
html.body=tag;
if (tag->args.findkey("id")!=-1)
html.tags.add(tag->args["id"],tag);
++i;
if (tag->type.len() && tag->type[0]=='/' && tag->type.substr(1) == type)
{ ldinfo(" << "+type); delete tag; return; }
if (single && tag->type.len() && tag->type[0]=='/')
{ ldinfo(" << "+type); i=ts; delete tag; return; }
if (tag->type.len() && tag->type[0]=='/')
{ lerror("Unexpected end tag: "+tag->type+" in tag: "+type); i=ts; delete tag; return; }
if (!tag->single || containingTypes.find(tag->type)!=-1)
tag->parse(html,data,i);
parts.addref(tag);
}
if (i";
return;
}
if (type=="!doctype"){
odata+="";
return;
}
odata+="<"+type;
for (i=0; i";
}else
odata+=">";
for (i=0; i";
}
void ehtml::clear()
{
images.clear();
links.clear();
scripts.clear();
css.clear();
head=0x00;
body=0x00;
parts.clear();
}
// Parses `data` as a complete document. Any previously parsed content is
// dropped first via clear(); parsing then starts at offset 0 through the
// ehtmltag::parse implementation.
// Prints the input length to stdout before parsing.
void ehtml::parse(const estr& data)
{
  clear();
  cout << "ehtml::parse, data.len=" << data.len() << endl;
  int pos=0;  // read cursor; ehtmltag::parse takes it by reference
  ehtmltag::parse(*this,data,pos);
}
void ehtml::load(const estr& filename)
{
efile file(filename,"r");
if (!file.exists()) { lerror("html file not found: "+filename); return; }
estr data;
file.read(data);
parse(data);
}
void ehtml::save(const estr& filename)
{
efile file(filename,"w");
estr data(make());
file.write(data);
file.close();
}
estr ehtml::make()
{
int i;
estr odata;
for (i=0; i