Add support for the libxml2 library.

This adds the html-parse-string and xml-parse-string functions in the
new file src/xml.c, as well as autoconf detection of the library.
This commit is contained in:
Lars Magne Ingebrigtsen 2010-09-10 18:44:35 +02:00
parent 36f7d36669
commit 381408e219
10 changed files with 344 additions and 3 deletions

View file

@ -1,3 +1,7 @@
2010-09-10 Lars Magne Ingebrigtsen <larsi@gnus.org>
* configure.in: Check for libxml2.
2010-09-09 Glenn Morris <rgm@gnu.org>
* make-dist: No more TODO files under lisp/.

118
configure vendored
View file

@ -660,6 +660,8 @@ BLESSMAIL_TARGET
LIBS_MAIL
liblockfile
ALLOCA
LIBXML2_LIBS
LIBXML2_CFLAGS
LIBXSM
LIBGPM
LIBGIF
@ -807,6 +809,7 @@ with_tiff
with_gif
with_png
with_rsvg
with_xml2
with_imagemagick
with_xft
with_libotf
@ -1514,6 +1517,7 @@ Optional Packages:
--without-gif don't compile with GIF image support
--without-png don't compile with PNG image support
--without-rsvg don't compile with SVG image support
--without-xml2 don't compile with XML parsing support
--with-imagemagick compile with ImageMagick image support
--without-xft don't use XFT for anti aliased fonts
--without-libotf don't use libotf for OpenType font support
@ -2732,6 +2736,14 @@ else
fi
# Check whether --with-xml2 was given.
if test "${with_xml2+set}" = set; then :
withval=$with_xml2;
else
with_xml2=yes
fi
# Check whether --with-imagemagick was given.
if test "${with_imagemagick+set}" = set; then :
withval=$with_imagemagick;
@ -11070,6 +11082,112 @@ $as_echo "#define HAVE_X_SM 1" >>confdefs.h
fi
### Use libxml (-lxml2) if available
if test "${with_xml2}" != "no"; then
### I'm not sure what the version number should be, so I just guessed.
succeeded=no
# Extract the first word of "pkg-config", so it can be a program name with args.
set dummy pkg-config; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if test "${ac_cv_path_PKG_CONFIG+set}" = set; then :
$as_echo_n "(cached) " >&6
else
case $PKG_CONFIG in
[\\/]* | ?:[\\/]*)
ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path.
;;
*)
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
test -z "$ac_cv_path_PKG_CONFIG" && ac_cv_path_PKG_CONFIG="no"
;;
esac
fi
PKG_CONFIG=$ac_cv_path_PKG_CONFIG
if test -n "$PKG_CONFIG"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5
$as_echo "$PKG_CONFIG" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
if test "$PKG_CONFIG" = "no" ; then
HAVE_LIBXML2=no
else
PKG_CONFIG_MIN_VERSION=0.9.0
if $PKG_CONFIG --atleast-pkgconfig-version $PKG_CONFIG_MIN_VERSION; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for libxml-2.0 > 2.2.0" >&5
$as_echo_n "checking for libxml-2.0 > 2.2.0... " >&6; }
if $PKG_CONFIG --exists "libxml-2.0 > 2.2.0" 2>&5; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
succeeded=yes
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking LIBXML2_CFLAGS" >&5
$as_echo_n "checking LIBXML2_CFLAGS... " >&6; }
LIBXML2_CFLAGS=`$PKG_CONFIG --cflags "libxml-2.0 > 2.2.0"|sed -e 's,///*,/,g'`
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIBXML2_CFLAGS" >&5
$as_echo "$LIBXML2_CFLAGS" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking LIBXML2_LIBS" >&5
$as_echo_n "checking LIBXML2_LIBS... " >&6; }
LIBXML2_LIBS=`$PKG_CONFIG --libs "libxml-2.0 > 2.2.0"|sed -e 's,///*,/,g'`
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIBXML2_LIBS" >&5
$as_echo "$LIBXML2_LIBS" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
LIBXML2_CFLAGS=""
LIBXML2_LIBS=""
## If we have a custom action on failure, don't print errors, but
## do set a variable so people can do so.
LIBXML2_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "libxml-2.0 > 2.2.0"`
fi
else
echo "*** Your version of pkg-config is too old. You need version $PKG_CONFIG_MIN_VERSION or newer."
echo "*** See http://www.freedesktop.org/software/pkgconfig"
fi
fi
if test $succeeded = yes; then
HAVE_LIBXML2=yes
else
HAVE_LIBXML2=no
fi
if test "${HAVE_LIBXML2}" = "yes"; then
$as_echo "#define HAVE_LIBXML2 1" >>confdefs.h
fi
fi
# If netdb.h doesn't declare h_errno, we must declare it by hand.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether netdb declares h_errno" >&5
$as_echo_n "checking whether netdb declares h_errno... " >&6; }

View file

@ -155,6 +155,7 @@ OPTION_DEFAULT_ON([tiff],[don't compile with TIFF image support])
OPTION_DEFAULT_ON([gif],[don't compile with GIF image support])
OPTION_DEFAULT_ON([png],[don't compile with PNG image support])
OPTION_DEFAULT_ON([rsvg],[don't compile with SVG image support])
OPTION_DEFAULT_ON([xml2],[don't compile with XML parsing support])
OPTION_DEFAULT_OFF([imagemagick],[compile with ImageMagick image support])
OPTION_DEFAULT_ON([xft],[don't use XFT for anti aliased fonts])
@ -2535,6 +2536,17 @@ if test "${HAVE_X11}" = "yes"; then
fi
AC_SUBST(LIBXSM)
### Use libxml (-lxml2) if available
if test "${with_xml2}" != "no"; then
### I'm not sure what the version number should be, so I just guessed.
PKG_CHECK_MODULES(LIBXML2, libxml-2.0 > 2.2.0, HAVE_LIBXML2=yes, HAVE_LIBXML2=no)
if test "${HAVE_LIBXML2}" = "yes"; then
AC_DEFINE(HAVE_LIBXML2, 1, [Define to 1 if you have the libxml library (-lxml2).])
fi
fi
AC_SUBST(LIBXML2_LIBS)
AC_SUBST(LIBXML2_CFLAGS)
# If netdb.h doesn't declare h_errno, we must declare it by hand.
AC_CACHE_CHECK(whether netdb declares h_errno,
emacs_cv_netdb_declares_h_errno,

View file

@ -59,6 +59,7 @@ the character after point.
position stored in a register.
* Base 64:: Conversion to or from base 64 encoding.
* MD5 Checksum:: Compute the MD5 "message digest"/"checksum".
* Parsing HTML:: Parsing HTML and XML.
* Atomic Changes:: Installing several buffer changes "atomically".
* Change Hooks:: Supplying functions to be run when text is changed.
@end menu
@ -4106,6 +4107,49 @@ using the specified or chosen coding system. However, if
coding instead.
@end defun
@node Parsing HTML
@section Parsing HTML
@cindex parsing html
@cindex parsing xml
Emacs provides an interface to the @code{libxml2} library via two
functions: @code{html-parse-string} and @code{xml-parse-string}. The
HTML function will parse ``real world'' HTML and try to return a
sensible parse tree, while the XML function is somewhat stricter about
syntax.
They both take two parameters. The first is the string to parse, and
the second, which is optional, is a base URL to be used to expand
relative URLs in the document, if any.
Here's an example demonstrating the structure of the parsed data you
get out. Given this HTML document:
@example
<html><hEad></head><body width=101><div class=thing>Foo<div>Yes
@end example
You get this parse tree:
@example
(html
(head)
(body
(:width . "101")
(div
(:class . "thing")
(text . "Foo")
(div
(text . "Yes\n")))))
@end example
It's a simple tree structure, where the @code{car} for each node is
the name of the node, and the @code{cdr} is the value, or the list of
values.
Attributes are coded the same way as child nodes, but with @samp{:} as
the first character.
@node Atomic Changes
@section Atomic Change Groups
@cindex atomic changes

View file

@ -1,3 +1,13 @@
2010-09-09 Lars Magne Ingebrigtsen <larsi@gnus.org>
* xml.c (Fxml_parse_string): New function to parse XML files.
2010-09-08 Lars Magne Ingebrigtsen <larsi@gnus.org>
* xml.c: New file.
(Fhtml_parse_string): New function to interface to the libxml2
html parsing function.
2010-09-05 Juanma Barranquero <lekktu@gmail.com>
* biditype.h: Regenerate.

View file

@ -226,6 +226,8 @@ RSVG_CFLAGS= @RSVG_CFLAGS@
IMAGEMAGICK_LIBS= @IMAGEMAGICK_LIBS@
IMAGEMAGICK_CFLAGS= @IMAGEMAGICK_CFLAGS@
LIBXML2_LIBS = @LIBXML2_LIBS@
LIBXML2_CFLAGS = @LIBXML2_CFLAGS@
## widget.o if USE_X_TOOLKIT, otherwise empty.
WIDGET_OBJ=@WIDGET_OBJ@
@ -320,7 +322,8 @@ MKDEPDIR=@MKDEPDIR@
## FIXME? MYCPPFLAGS only referenced in etc/DEBUG.
ALL_CFLAGS=-Demacs -DHAVE_CONFIG_H $(MYCPPFLAGS) -I. -I${srcdir} \
${C_SWITCH_MACHINE} ${C_SWITCH_SYSTEM} ${C_SWITCH_X_SITE} \
${C_SWITCH_X_SYSTEM} ${CFLAGS_SOUND} ${RSVG_CFLAGS} ${IMAGEMAGICK_CFLAGS} ${DBUS_CFLAGS} \
${C_SWITCH_X_SYSTEM} ${CFLAGS_SOUND} ${RSVG_CFLAGS} ${IMAGEMAGICK_CFLAGS} \
${LIBXML2_CFLAGS} ${DBUS_CFLAGS} \
${GCONF_CFLAGS} ${FREETYPE_CFLAGS} ${FONTCONFIG_CFLAGS} \
${LIBOTF_CFLAGS} ${M17N_FLT_CFLAGS} ${DEPFLAGS} ${PROFILING_CFLAGS} \
${C_WARNINGS_SWITCH} ${CFLAGS}
@ -349,7 +352,7 @@ obj= dispnew.o frame.o scroll.o xdisp.o menu.o $(XMENU_OBJ) window.o \
syntax.o $(UNEXEC_OBJ) bytecode.o \
process.o callproc.o \
region-cache.o sound.o atimer.o \
doprnt.o strftime.o intervals.o textprop.o composite.o md5.o \
doprnt.o strftime.o intervals.o textprop.o composite.o md5.o xml.o \
$(MSDOS_OBJ) $(MSDOS_X_OBJ) $(NS_OBJ) $(CYGWIN_OBJ) $(FONT_OBJ)
## Object files used on some machine or other.
@ -595,7 +598,8 @@ SOME_MACHINE_LISP = ../lisp/mouse.elc \
## duplicated symbols. If the standard libraries were compiled
## with GCC, we might need LIB_GCC again after them.
LIBES = $(LIBS) $(LIBX_BASE) $(LIBX_OTHER) $(LIBSOUND) \
$(RSVG_LIBS) ${IMAGEMAGICK_LIBS} $(DBUS_LIBS) $(LIBGPM) $(LIBRESOLV) $(LIBS_SYSTEM) \
$(RSVG_LIBS) ${IMAGEMAGICK_LIBS} $(DBUS_LIBS) \
${LIBXML2_LIBS} $(LIBGPM) $(LIBRESOLV) $(LIBS_SYSTEM) \
$(LIBS_TERMCAP) $(GETLOADAVG_LIBS) ${GCONF_LIBS} ${LIBSELINUX_LIBS} \
$(FREETYPE_LIBS) $(FONTCONFIG_LIBS) $(LIBOTF_LIBS) $(M17N_FLT_LIBS) \
$(LIB_GCC) $(LIB_MATH) $(LIB_STANDARD) $(LIB_GCC)

View file

@ -813,6 +813,9 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
/* Define to 1 if you have the SM library (-lSM). */
#undef HAVE_X_SM
/* Define to 1 if you have the libxml2 library (-lxml2). */
#undef HAVE_LIBXML2
/* Define to 1 if you want to use the X window system. */
#undef HAVE_X_WINDOWS

View file

@ -1544,6 +1544,10 @@ main (int argc, char **argv)
#endif
#endif /* HAVE_X_WINDOWS */
#ifdef HAVE_LIBXML2
syms_of_xml ();
#endif
syms_of_menu ();
#ifdef HAVE_NTGUI

View file

@ -3577,6 +3577,11 @@ extern char *x_get_keysym_name (int);
EXFUN (Fmsdos_downcase_filename, 1);
#endif
#ifdef HAVE_LIBXML2
/* Defined in xml.c */
extern void syms_of_xml (void);
#endif
#ifdef HAVE_MENUS
/* Defined in (x|w32)fns.c, nsfns.m... */
extern int have_menus_p (void);

137
src/xml.c Normal file
View file

@ -0,0 +1,137 @@
/* Interface to libxml2.
Copyright (C) 2010 Free Software Foundation, Inc.
This file is part of GNU Emacs.
GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
#ifdef HAVE_LIBXML2
#include <setjmp.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
#include "lisp.h"
#include "buffer.h"
/* Convert the libxml2 node NODE into a Lisp parse tree.

   An element node becomes the list (NAME ATTRIBUTE... CHILD...):
   NAME is the interned element name; each attribute whose first child
   node has content becomes a cons (:ATTR-NAME . "value"); each child
   node is converted recursively, in document order.

   A text node becomes the cons (NAME . CONTENT), where NAME is the
   interned node name and CONTENT is a string, or nil if the node has
   no content.

   Any other node type yields nil, so such children appear as nil
   entries in their parent's list.

   libxml2 stores names and content as `xmlChar *' (unsigned char);
   they are cast explicitly before being handed to the `char *' based
   Lisp and C string functions.  */
Lisp_Object
make_dom (xmlNode *node)
{
  if (node->type == XML_ELEMENT_NODE)
    {
      Lisp_Object result = Fcons (intern ((const char *) node->name), Qnil);
      xmlAttr *property;
      xmlNode *child;

      /* The list is built back to front and reversed once at the end.
	 First add the attributes.  */
      for (property = node->properties;
	   property != NULL;
	   property = property->next)
	{
	  if (property->children && property->children->content)
	    {
	      const char *name = (const char *) property->name;
	      size_t len = strlen (name);
	      /* Room for the leading ':' and the trailing NUL.  */
	      char *pname = xmalloc (len + 2);

	      pname[0] = ':';
	      memcpy (pname + 1, name, len + 1);
	      result
		= Fcons (Fcons (intern (pname),
				build_string ((const char *)
					      property->children->content)),
			 result);
	      xfree (pname);
	    }
	}

      /* Then add the children of the node.  */
      for (child = node->children; child != NULL; child = child->next)
	result = Fcons (make_dom (child), result);

      return Fnreverse (result);
    }
  else if (node->type == XML_TEXT_NODE)
    {
      Lisp_Object content = Qnil;

      if (node->content)
	content = build_string ((const char *) node->content);

      return Fcons (intern ((const char *) node->name), content);
    }
  else
    return Qnil;
}
/* Parse STRING with libxml2 and return the resulting Lisp parse tree.

   STRING must be a Lisp string; its bytes are handed to the parser
   with the encoding given as "utf-8".  BASE_URL is either nil or a
   Lisp string that is passed to libxml2 as the document URL; when it
   is nil the empty string is passed instead.  If HTMLP is non-zero
   the HTML parser (in recovering mode) is used, otherwise the XML
   parser.  Network access, warnings and error output are disabled in
   both cases.

   Return the tree built by make_dom from the document's root element,
   or nil if the document could not be parsed or has no root element.  */
static Lisp_Object
parse_buffer (Lisp_Object string, Lisp_Object base_url, int htmlp)
{
  xmlDoc *doc;
  /* Must start out as nil: neither failure path below assigns it.  */
  Lisp_Object result = Qnil;
  const char *burl = "";

  LIBXML_TEST_VERSION;

  CHECK_STRING (string);

  if (! NILP (base_url))
    {
      CHECK_STRING (base_url);
      burl = (const char *) SDATA (base_url);
    }

  if (htmlp)
    doc = htmlReadMemory ((const char *) SDATA (string), SBYTES (string),
			  burl, "utf-8",
			  HTML_PARSE_RECOVER|HTML_PARSE_NONET|
			  HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR);
  else
    doc = xmlReadMemory ((const char *) SDATA (string), SBYTES (string),
			 burl, "utf-8",
			 XML_PARSE_NONET|XML_PARSE_NOWARNING|
			 XML_PARSE_NOERROR);

  if (doc != NULL)
    {
      xmlNode *node = xmlDocGetRootElement (doc);

      /* NOTE(review): if make_dom exits non-locally (e.g. on memory
	 exhaustion) DOC is not freed -- confirm whether an
	 unwind-protect is wanted here.  */
      if (node != NULL)
	result = make_dom (node);

      xmlFreeDoc (doc);
      /* xmlCleanupParser is deliberately not called here.  It releases
	 libxml2's global state rather than per-document memory, and the
	 libxml2 documentation says to call it only once the process has
	 finished using the library; calling it after each parse can
	 break other libxml2 users in the same process.  */
    }

  return result;
}
/* STRING is mandatory (parse_buffer rejects anything that is not a
   string), so the minimum argument count is 1.  */
DEFUN ("html-parse-string", Fhtml_parse_string, Shtml_parse_string,
       1, 2, 0,
       doc: /* Parse STRING as an HTML document and return the parse tree.
STRING must be a string.  Optional argument BASE-URL, if non-nil, must
also be a string; it is used as the base URL to expand relative URLs in
the document.  */)
  (Lisp_Object string, Lisp_Object base_url)
{
  return parse_buffer (string, base_url, 1);
}
/* STRING is mandatory (parse_buffer rejects anything that is not a
   string), so the minimum argument count is 1.  */
DEFUN ("xml-parse-string", Fxml_parse_string, Sxml_parse_string,
       1, 2, 0,
       doc: /* Parse STRING as an XML document and return the parse tree.
STRING must be a string.  Optional argument BASE-URL, if non-nil, must
also be a string; it is used as the base URL to expand relative URLs in
the document.  */)
  (Lisp_Object string, Lisp_Object base_url)
{
  return parse_buffer (string, base_url, 0);
}
/***********************************************************************
Initialization
***********************************************************************/
/* Call defsubr for the two primitives defined in this file,
   `html-parse-string' and `xml-parse-string'.  main in emacs.c calls
   this function when HAVE_LIBXML2 is defined.  */
void
syms_of_xml (void)
{
defsubr (&Shtml_parse_string);
defsubr (&Sxml_parse_string);
}
#endif /* HAVE_LIBXML2 */