HTMLparser.h 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /*
  2. * Summary: interface for an HTML 4.0 non-verifying parser
  3. * Description: this module implements an HTML 4.0 non-verifying parser
  4. * with API compatible with the XML parser ones. It should
  5. * be able to parse "real world" HTML, even if severely
  6. * broken from a specification point of view.
  7. *
  8. * Copy: See Copyright for the status of this software.
  9. *
  10. * Author: Daniel Veillard
  11. */
  12. #ifndef __HTML_PARSER_H__
  13. #define __HTML_PARSER_H__
  14. #include <libxml/xmlversion.h>
  15. #include <libxml/parser.h>
  16. #ifdef LIBXML_HTML_ENABLED
  17. #ifdef __cplusplus
  18. extern "C" {
  19. #endif
  20. /*
  21. * Most of the back-end structures from XML and HTML are shared.
  22. */
  23. typedef xmlParserCtxt htmlParserCtxt;
  24. typedef xmlParserCtxtPtr htmlParserCtxtPtr;
  25. typedef xmlParserNodeInfo htmlParserNodeInfo;
  26. typedef xmlSAXHandler htmlSAXHandler;
  27. typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
  28. typedef xmlParserInput htmlParserInput;
  29. typedef xmlParserInputPtr htmlParserInputPtr;
  30. typedef xmlDocPtr htmlDocPtr;
  31. typedef xmlNodePtr htmlNodePtr;
  32. /*
  33. * Internal description of an HTML element, representing HTML 4.01
  34. * and XHTML 1.0 (which share the same structure).
  35. */
  36. typedef struct _htmlElemDesc htmlElemDesc;
  37. typedef htmlElemDesc *htmlElemDescPtr;
  38. struct _htmlElemDesc {
  39. const char *name; /* The tag name */
  40. char startTag; /* Whether the start tag can be implied */
  41. char endTag; /* Whether the end tag can be implied */
  42. char saveEndTag; /* Whether the end tag should be saved */
  43. char empty; /* Is this an empty element ? */
  44. char depr; /* Is this a deprecated element ? */
  45. char dtd; /* 1: only in Loose DTD, 2: only Frameset one */
  46. char isinline; /* is this a block 0 or inline 1 element */
  47. const char *desc; /* the description */
  48. /* NRK Jan.2003
  49. * New fields encapsulating HTML structure
  50. *
  51. * Bugs:
  52. * This is a very limited representation. It fails to tell us when
  53. * an element *requires* subelements (we only have whether they're
  54. * allowed or not), and it doesn't tell us where CDATA and PCDATA
  55. * are allowed. Some element relationships are not fully represented:
  56. * these are flagged with the word MODIFIER
  57. */
  58. const char** subelts; /* allowed sub-elements of this element */
  59. const char* defaultsubelt; /* subelement for suggested auto-repair
  60. if necessary or NULL */
  61. const char** attrs_opt; /* Optional Attributes */
  62. const char** attrs_depr; /* Additional deprecated attributes */
  63. const char** attrs_req; /* Required attributes */
  64. };
  65. /*
  66. * Internal description of an HTML entity.
  67. */
  68. typedef struct _htmlEntityDesc htmlEntityDesc;
  69. typedef htmlEntityDesc *htmlEntityDescPtr;
  70. struct _htmlEntityDesc {
  71. unsigned int value; /* the UNICODE value for the character */
  72. const char *name; /* The entity name */
  73. const char *desc; /* the description */
  74. };
  75. #ifdef LIBXML_SAX1_ENABLED
  76. XML_DEPRECATED
  77. XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
  78. #ifdef LIBXML_THREAD_ENABLED
  79. XML_DEPRECATED
  80. XMLPUBFUN const xmlSAXHandlerV1 *__htmlDefaultSAXHandler(void);
  81. #endif
  82. #endif /* LIBXML_SAX1_ENABLED */
  83. /*
  84. * There is only few public functions.
  85. */
  86. XML_DEPRECATED
  87. XMLPUBFUN void
  88. htmlInitAutoClose (void);
  89. XMLPUBFUN const htmlElemDesc *
  90. htmlTagLookup (const xmlChar *tag);
  91. XMLPUBFUN const htmlEntityDesc *
  92. htmlEntityLookup(const xmlChar *name);
  93. XMLPUBFUN const htmlEntityDesc *
  94. htmlEntityValueLookup(unsigned int value);
  95. XMLPUBFUN int
  96. htmlIsAutoClosed(htmlDocPtr doc,
  97. htmlNodePtr elem);
  98. XMLPUBFUN int
  99. htmlAutoCloseTag(htmlDocPtr doc,
  100. const xmlChar *name,
  101. htmlNodePtr elem);
  102. XML_DEPRECATED
  103. XMLPUBFUN const htmlEntityDesc *
  104. htmlParseEntityRef(htmlParserCtxtPtr ctxt,
  105. const xmlChar **str);
  106. XML_DEPRECATED
  107. XMLPUBFUN int
  108. htmlParseCharRef(htmlParserCtxtPtr ctxt);
  109. XML_DEPRECATED
  110. XMLPUBFUN void
  111. htmlParseElement(htmlParserCtxtPtr ctxt);
  112. XMLPUBFUN htmlParserCtxtPtr
  113. htmlNewParserCtxt(void);
  114. XMLPUBFUN htmlParserCtxtPtr
  115. htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
  116. void *userData);
  117. XMLPUBFUN htmlParserCtxtPtr
  118. htmlCreateMemoryParserCtxt(const char *buffer,
  119. int size);
  120. XMLPUBFUN int
  121. htmlParseDocument(htmlParserCtxtPtr ctxt);
  122. XML_DEPRECATED
  123. XMLPUBFUN htmlDocPtr
  124. htmlSAXParseDoc (const xmlChar *cur,
  125. const char *encoding,
  126. htmlSAXHandlerPtr sax,
  127. void *userData);
  128. XMLPUBFUN htmlDocPtr
  129. htmlParseDoc (const xmlChar *cur,
  130. const char *encoding);
  131. XMLPUBFUN htmlParserCtxtPtr
  132. htmlCreateFileParserCtxt(const char *filename,
  133. const char *encoding);
  134. XML_DEPRECATED
  135. XMLPUBFUN htmlDocPtr
  136. htmlSAXParseFile(const char *filename,
  137. const char *encoding,
  138. htmlSAXHandlerPtr sax,
  139. void *userData);
  140. XMLPUBFUN htmlDocPtr
  141. htmlParseFile (const char *filename,
  142. const char *encoding);
  143. XMLPUBFUN int
  144. UTF8ToHtml (unsigned char *out,
  145. int *outlen,
  146. const unsigned char *in,
  147. int *inlen);
  148. XMLPUBFUN int
  149. htmlEncodeEntities(unsigned char *out,
  150. int *outlen,
  151. const unsigned char *in,
  152. int *inlen, int quoteChar);
  153. XMLPUBFUN int
  154. htmlIsScriptAttribute(const xmlChar *name);
  155. XML_DEPRECATED
  156. XMLPUBFUN int
  157. htmlHandleOmittedElem(int val);
  158. #ifdef LIBXML_PUSH_ENABLED
  159. /**
  160. * Interfaces for the Push mode.
  161. */
  162. XMLPUBFUN htmlParserCtxtPtr
  163. htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
  164. void *user_data,
  165. const char *chunk,
  166. int size,
  167. const char *filename,
  168. xmlCharEncoding enc);
  169. XMLPUBFUN int
  170. htmlParseChunk (htmlParserCtxtPtr ctxt,
  171. const char *chunk,
  172. int size,
  173. int terminate);
  174. #endif /* LIBXML_PUSH_ENABLED */
  175. XMLPUBFUN void
  176. htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
  177. /*
  178. * New set of simpler/more flexible APIs
  179. */
  180. /**
  181. * xmlParserOption:
  182. *
  183. * This is the set of XML parser options that can be passed down
  184. * to the xmlReadDoc() and similar calls.
  185. */
  186. typedef enum {
  187. HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
  188. HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
  189. HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
  190. HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
  191. HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
  192. HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
  193. HTML_PARSE_NONET = 1<<11,/* Forbid network access */
  194. HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
  195. HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
  196. HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */
  197. } htmlParserOption;
  198. XMLPUBFUN void
  199. htmlCtxtReset (htmlParserCtxtPtr ctxt);
  200. XMLPUBFUN int
  201. htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
  202. int options);
  203. XMLPUBFUN htmlDocPtr
  204. htmlReadDoc (const xmlChar *cur,
  205. const char *URL,
  206. const char *encoding,
  207. int options);
  208. XMLPUBFUN htmlDocPtr
  209. htmlReadFile (const char *URL,
  210. const char *encoding,
  211. int options);
  212. XMLPUBFUN htmlDocPtr
  213. htmlReadMemory (const char *buffer,
  214. int size,
  215. const char *URL,
  216. const char *encoding,
  217. int options);
  218. XMLPUBFUN htmlDocPtr
  219. htmlReadFd (int fd,
  220. const char *URL,
  221. const char *encoding,
  222. int options);
  223. XMLPUBFUN htmlDocPtr
  224. htmlReadIO (xmlInputReadCallback ioread,
  225. xmlInputCloseCallback ioclose,
  226. void *ioctx,
  227. const char *URL,
  228. const char *encoding,
  229. int options);
  230. XMLPUBFUN htmlDocPtr
  231. htmlCtxtParseDocument (htmlParserCtxtPtr ctxt,
  232. xmlParserInputPtr input);
  233. XMLPUBFUN htmlDocPtr
  234. htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
  235. const xmlChar *cur,
  236. const char *URL,
  237. const char *encoding,
  238. int options);
  239. XMLPUBFUN htmlDocPtr
  240. htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
  241. const char *filename,
  242. const char *encoding,
  243. int options);
  244. XMLPUBFUN htmlDocPtr
  245. htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
  246. const char *buffer,
  247. int size,
  248. const char *URL,
  249. const char *encoding,
  250. int options);
  251. XMLPUBFUN htmlDocPtr
  252. htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
  253. int fd,
  254. const char *URL,
  255. const char *encoding,
  256. int options);
  257. XMLPUBFUN htmlDocPtr
  258. htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
  259. xmlInputReadCallback ioread,
  260. xmlInputCloseCallback ioclose,
  261. void *ioctx,
  262. const char *URL,
  263. const char *encoding,
  264. int options);
  265. /* NRK/Jan2003: further knowledge of HTML structure
  266. */
  267. typedef enum {
  268. HTML_NA = 0 , /* something we don't check at all */
  269. HTML_INVALID = 0x1 ,
  270. HTML_DEPRECATED = 0x2 ,
  271. HTML_VALID = 0x4 ,
  272. HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
  273. } htmlStatus ;
  274. /* Using htmlElemDesc rather than name here, to emphasise the fact
  275. that otherwise there's a lookup overhead
  276. */
  277. XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
  278. XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
  279. XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
  280. XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ;
  281. /**
  282. * htmlDefaultSubelement:
  283. * @elt: HTML element
  284. *
  285. * Returns the default subelement for this element
  286. */
  287. #define htmlDefaultSubelement(elt) elt->defaultsubelt
  288. /**
  289. * htmlElementAllowedHereDesc:
  290. * @parent: HTML parent element
  291. * @elt: HTML element
  292. *
  293. * Checks whether an HTML element description may be a
  294. * direct child of the specified element.
  295. *
  296. * Returns 1 if allowed; 0 otherwise.
  297. */
  298. #define htmlElementAllowedHereDesc(parent,elt) \
  299. htmlElementAllowedHere((parent), (elt)->name)
  300. /**
  301. * htmlRequiredAttrs:
  302. * @elt: HTML element
  303. *
  304. * Returns the attributes required for the specified element.
  305. */
  306. #define htmlRequiredAttrs(elt) (elt)->attrs_req
  307. #ifdef __cplusplus
  308. }
  309. #endif
  310. #endif /* LIBXML_HTML_ENABLED */
  311. #endif /* __HTML_PARSER_H__ */