Vuo  2.3.2
VuoTextHtml.c
Go to the documentation of this file.
1 
10 #include "VuoTextHtml.h"
11 
12 #include <libxml/xpath.h>
13 #include <libxml/HTMLparser.h>
14 
15 #include "module.h"
16 
17 #ifdef VUO_COMPILER
19  "title" : "VuoTextHtml",
20  "dependencies" : [
21  "xml2",
22  "z",
23  "VuoText"
24  ]
25  });
26 #endif
27 
31 static void VuoXmlError(void *unused, xmlError *error)
32 {
33  char *message = strdup(error->message);
34  size_t len = strlen(message);
35  if (message[len-1] == '\n')
36  message[len-1] = 0;
37 
38  VUserLog("Error: %s (line %i)", message, error->line);
39 
40  free(message);
41 }
42 
46 static void __attribute__((constructor)) init()
47 {
48  xmlInitParser();
49  xmlSetStructuredErrorFunc(NULL, VuoXmlError);
50 }
51 
57 {
58  if (!text)
59  return NULL;
60 
61  // First check whether the text contains stuff that looks like HTML,
62  // so we don't waste time parsing it if it won't change anything.
63  size_t length = strlen(text);
64  bool found = false;
65  for (unsigned int i = 0; i < length; ++i)
66  if (text[i] == '<' || text[i] == '&')
67  {
68  found = true;
69  break;
70  }
71  if (!found)
72  return VuoText_make(text);
73 
74  int options = HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
75  xmlDocPtr doc = htmlReadDoc((const xmlChar *)text, "", "UTF-8", options);
76  if (!doc)
77  {
78  // Sometimes it works on the second try.
79  doc = htmlReadDoc((const xmlChar *)text, "", "UTF-8", options);
80  if (!doc)
81  return NULL;
82  }
83  VuoDefer(^{ xmlFreeDoc(doc); });
84 
85 
86  // Remove the <style> and <script> tags.
87  {
88  xmlXPathContextPtr xpathContext = xmlXPathNewContext(doc);
89  if (!xpathContext)
90  return NULL;
91  VuoDefer(^{ xmlXPathFreeContext(xpathContext); });
92 
93  xmlXPathObjectPtr xpathObject = xmlXPathEvalExpression((const unsigned char *)"//style|//script", xpathContext);
94  if (!xpathObject)
95  return NULL;
96  VuoDefer(^{ xmlXPathFreeObject(xpathObject); });
97 
98  if (!xmlXPathNodeSetIsEmpty(xpathObject->nodesetval))
99  {
100  for (int i = 0; i < xpathObject->nodesetval->nodeNr; ++i)
101  {
102  xmlUnlinkNode(xpathObject->nodesetval->nodeTab[i]);
103  xmlFree(xpathObject->nodesetval->nodeTab[i]);
104  xpathObject->nodesetval->nodeTab[i] = NULL;
105  }
106  }
107  }
108 
109 
110  xmlNodePtr root = xmlDocGetRootElement(doc);
111  xmlChar *content = xmlNodeGetContent(root);
112  VuoDefer(^{ xmlFree(content); });
113 
114  return VuoText_make((const char *)content);
115 }