From 9477b27154cfb8ad4fdc4b475435d0ecdad2fa5c Mon Sep 17 00:00:00 2001 From: blacklight Date: Wed, 18 Feb 2009 00:10:57 +0100 Subject: [PATCH] Adding sources --- src/Doxyfile | 1252 ++++++++++++++ src/Markup.cpp | 4127 +++++++++++++++++++++++++++++++++++++++++++++ src/layer.cpp | 102 ++ src/neural_doc.h | 194 +++ src/neuralnet.cpp | 578 +++++++ src/neuron.cpp | 98 ++ src/synapsis.cpp | 84 + 7 files changed, 6435 insertions(+) create mode 100644 src/Doxyfile create mode 100644 src/Markup.cpp create mode 100644 src/layer.cpp create mode 100644 src/neural_doc.h create mode 100644 src/neuralnet.cpp create mode 100644 src/neuron.cpp create mode 100644 src/synapsis.cpp diff --git a/src/Doxyfile b/src/Doxyfile new file mode 100644 index 0000000..c7b817c --- /dev/null +++ b/src/Doxyfile @@ -0,0 +1,1252 @@ +# Doxyfile 1.5.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = Neural++ + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 0.2.1 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = ../doc + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, +# Italian, Japanese, Japanese-en (Japanese with English messages), Korean, +# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, +# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# This tag can be used to specify the encoding used in the generated output. +# The encoding is not always determined by the language that is chosen, +# but also whether or not the output is meant for Windows or non-Windows users. +# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES +# forces the Windows encoding (this is the default for the Windows binary), +# whereas setting the tag to NO uses a Unix-style encoding (the default for +# all platforms other than Windows). 
+ +USE_WINDOWS_ENCODING = NO + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. 
+ +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like the Qt-style comments (thus requiring an +# explicit @brief command for a brief description). + +JAVADOC_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = YES + +# If the DETAILS_AT_TOP tag is set to YES then Doxygen +# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member +# documentation. + +DETAILS_AT_TOP = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for Java. +# For instance, namespaces will be presented as packages, qualified scopes +# will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want to +# include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); vs. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation.
+ +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. 
+ +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. 
+ +SHOW_DIRECTORIES = NO + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from the +# version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation.
+ +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files.
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file.
Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES (the default) +# then for each documented function all documented +# functions referencing it will be listed. 
+ +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES (the default) +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. 
+ +HTML_ALIGN_MEMBERS = YES + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. 
+ +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. 
Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = NO + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = NO + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors.
+ +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. 
+ +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. 
+ +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. 
+ +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed.
To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. 
+ +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = YES + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO.
+ +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will +# generate a call dependency graph for every global function or class method. +# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable call graphs for selected +# functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then doxygen will +# generate a caller dependency graph for every global function or class method. 
+# Note that enabling this option will significantly increase the time of a run. +# So in most cases it will be better to enable caller graphs for selected +# functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. Beware that most browsers cannot cope with very +# large images. + +MAX_DOT_GRAPH_WIDTH = 1024 + +# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allowed height +# (in pixels) of the graphs generated by dot. If a graph becomes larger than +# this value, doxygen will try to truncate the graph, so that it fits within +# the specified constraint. Beware that most browsers cannot cope with very +# large images.
+ +MAX_DOT_GRAPH_HEIGHT = 1024 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that a graph may be further truncated if the graph's +# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH +# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), +# the graph is not depth-constrained. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, which results in a white background. +# Warning: Depending on the platform used, enabling this option may lead to +# badly anti-aliased labels on the edges of a graph (i.e. they become hard to +# read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. 
+ +DOT_CLEANUP = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to the search engine +#--------------------------------------------------------------------------- + +# The SEARCHENGINE tag specifies whether or not a search engine should be +# used. If set to NO the values of all tags below this one will be ignored. + +SEARCHENGINE = NO diff --git a/src/Markup.cpp b/src/Markup.cpp new file mode 100644 index 0000000..03ec7fa --- /dev/null +++ b/src/Markup.cpp @@ -0,0 +1,4127 @@ +// Markup.cpp: implementation of the CMarkup class. +// +// Markup Release 10.1 +// Copyright (C) 2008 First Objective Software, Inc. All rights reserved +// Go to www.firstobject.com for the latest CMarkup and EDOM documentation +// Use in commercial applications requires written permission +// This software is provided "as is", with no warranty. +// +#include +#include "Markup.h" + +#if defined(MCD_STRERROR) // C error routine +#include +#endif // C error routine + +#if defined (MARKUP_ICONV) +#include +#endif + +#if defined(MARKUP_STL) && ( defined(MARKUP_WINCONV) || (! 
defined(MCD_STRERROR))) +#include // for MultiByteToWideChar, WideCharToMultiByte, FormatMessage +#endif // need windows.h when STL and (not setlocale or not strerror), MFC afx.h includes it already + +#if defined(MARKUP_MBCS) // MBCS/double byte +#pragma message( "Note: MBCS build (not UTF-8)" ) +// For UTF-8, remove MBCS from project settings C/C++ preprocessor definitions +#if defined (MARKUP_WINCONV) +#include // for VC++ _mbclen +#endif // WINCONV +#endif // MBCS/double byte + +#if defined(_DEBUG) && _MSC_VER > 1000 // VC++ DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#if defined(DEBUG_NEW) +#define new DEBUG_NEW +#endif // DEBUG_NEW +#endif // VC++ DEBUG + +// Customization +#define x_EOL MCD_T("\r\n") // can be \r\n or \n or empty +#define x_EOLLEN (sizeof(x_EOL)/sizeof(MCD_CHAR)-1) // string length of x_EOL +#define x_ATTRIBQUOTE MCD_T("\"") // can be double or single quote + + +// Disable "while ( 1 )" warning in VC++ 2002 +#if _MSC_VER >= 1300 // VC++ 2002 (7.0) +#pragma warning(disable:4127) +#endif // VC++ 2002 (7.0) + +void CMarkup::operator=( const CMarkup& markup ) +{ + m_iPosParent = markup.m_iPosParent; + m_iPos = markup.m_iPos; + m_iPosChild = markup.m_iPosChild; + m_iPosFree = markup.m_iPosFree; + m_iPosDeleted = markup.m_iPosDeleted; + m_nNodeType = markup.m_nNodeType; + m_nNodeOffset = markup.m_nNodeOffset; + m_nNodeLength = markup.m_nNodeLength; + m_strDoc = markup.m_strDoc; + m_strError = markup.m_strError; + m_nDocFlags = markup.m_nDocFlags; + + // Copy used part of the index array + m_aPos.RemoveAll(); + m_aPos.nSize = m_iPosFree; + if ( m_aPos.nSize < 8 ) + m_aPos.nSize = 8; + m_aPos.nSegs = m_aPos.SegsUsed(); + if ( m_aPos.nSegs ) + { + m_aPos.pSegs = (ElemPos**)(new char[m_aPos.nSegs*sizeof(char*)]); + int nSegSize = 1 << m_aPos.PA_SEGBITS; + for ( int nSeg=0; nSeg < m_aPos.nSegs; ++nSeg ) + { + if ( nSeg + 1 == m_aPos.nSegs ) + nSegSize = m_aPos.GetSize() - (nSeg << m_aPos.PA_SEGBITS); + m_aPos.pSegs[nSeg] = 
(ElemPos*)(new char[nSegSize*sizeof(ElemPos)]); + memcpy( m_aPos.pSegs[nSeg], markup.m_aPos.pSegs[nSeg], nSegSize*sizeof(ElemPos) ); + } + } + + // Copy SavedPos map(s) + m_SavedPosMapArray.RemoveAll(); + if ( markup.m_SavedPosMapArray.pMaps ) + { + int nMap = 0; + SavedPosMap* pMap = NULL; + while ( markup.m_SavedPosMapArray.pMaps[nMap] ) + { + SavedPosMap* pMapSrc = markup.m_SavedPosMapArray.pMaps[nMap]; + x_GetMap( pMap, nMap, pMapSrc->nMapSize ); + for ( int nSlot=0; nSlot < pMap->nMapSize; ++nSlot ) + { + SavedPos* pCopySavedPos = pMapSrc->pTable[nSlot]; + if ( pCopySavedPos ) + { + int nCount = 0; + while ( pCopySavedPos[nCount].nSavedPosFlags & SavedPos::SPM_USED ) + { + ++nCount; + if ( pCopySavedPos[nCount-1].nSavedPosFlags & SavedPos::SPM_LAST ) + break; + } + if ( nCount ) + { + SavedPos* pNewSavedPos = new SavedPos[nCount]; + for ( int nCopy=0; nCopypTable[nSlot] = pNewSavedPos; + } + } + } + ++nMap; + } + } + + MARKUP_SETDEBUGSTATE; +} + +bool CMarkup::SetDoc( MCD_PCSZ pDoc ) +{ + // Set document text + if ( pDoc ) + m_strDoc = pDoc; + else + MCD_STRCLEAR(m_strDoc); + + MCD_STRCLEAR(m_strError); + return x_ParseDoc(); +}; + +bool CMarkup::SetDoc( const MCD_STR& strDoc ) +{ + m_strDoc = strDoc; + MCD_STRCLEAR(m_strError); + return x_ParseDoc(); +} + +bool CMarkup::IsWellFormed() +{ + if ( m_aPos.GetSize() + && ! (m_aPos[0].nFlags & MNF_ILLFORMED) + && m_aPos[0].iElemChild + && ! m_aPos[m_aPos[0].iElemChild].iElemNext ) + return true; + return false; +} + +bool CMarkup::Load( MCD_CSTR_FILENAME szFileName ) +{ + if ( ! 
ReadTextFile(szFileName, m_strDoc, &m_strError, &m_nDocFlags) ) + return false; + return x_ParseDoc(); +} + +bool CMarkup::ReadTextFile( MCD_CSTR_FILENAME szFileName, MCD_STR& strDoc, MCD_STR* pstrError, int* pnDocFlags, MCD_STR* pstrEncoding ) +{ + // Static utility method to load text file into strDoc + // strDoc is cleared first, so it stays empty when the file cannot be opened + // pstrError, if given, receives the combined I/O result text (also on success) + // pnDocFlags, if given, supplies the starting flags and receives file.nDocFlags after a successful open + // pstrEncoding, if given and non-empty, replaces a differing file encoding when nFileCharUnitSize is 1 + // Returns the result of x_Open, or of x_ReadText once the file is open + // + FilePos file; + file.nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_READFILE; + bool bSuccess = x_Open( szFileName, file ); + MCD_STR strCombinedIOResult = file.strIOResult; + MCD_STRCLEAR(strDoc); + if ( bSuccess ) + { + if ( pstrEncoding ) + { + MCD_STR strEncoding = *pstrEncoding; + if ( file.nFileCharUnitSize == 1 && ! MCD_STRISEMPTY(strEncoding) && file.strEncoding != strEncoding ) + { + file.strEncoding = strEncoding; // override + strCombinedIOResult += MCD_T("forced "); + strCombinedIOResult += file.strEncoding + MCD_T(" "); + } + } + // Request the whole file in one read + file.nReadByteLen = file.nFileByteLen; + bSuccess = x_ReadText( strDoc, file ); + x_Close( file ); + // Nothing reported so far: lead the result text with the encoding name, or "ANSI " if none + if ( MCD_STRISEMPTY(strCombinedIOResult) ) + { + if ( MCD_STRISEMPTY(file.strEncoding) ) + strCombinedIOResult = MCD_T("ANSI "); + else + strCombinedIOResult = file.strEncoding + MCD_T(" "); + } + strCombinedIOResult += file.strIOResult; + if ( pnDocFlags ) + *pnDocFlags = file.nDocFlags; + } + if ( pstrError ) + *pstrError = strCombinedIOResult; + return bSuccess; +} + +// Writes m_strDoc to szFileName via WriteTextFile; result text goes to m_strError, flags to m_nDocFlags +bool CMarkup::Save( MCD_CSTR_FILENAME szFileName ) +{ + return WriteTextFile( szFileName, m_strDoc, &m_strError, &m_nDocFlags ); +} + +bool CMarkup::WriteTextFile( MCD_CSTR_FILENAME szFileName, const MCD_STR& strDoc, MCD_STR* pstrError, int* pnDocFlags, MCD_STR* pstrEncoding ) +{ + // Static utility method to save strDoc to text file + // pnDocFlags, if given, supplies the starting flags; MDF_WRITEFILE is always OR'ed in + // + FilePos file; + file.nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_WRITEFILE; + bool bSuccess = x_Open( szFileName, file ); + MCD_STR strCombinedIOResult = file.strIOResult; + if ( bSuccess ) + { + if ( MCD_STRISEMPTY(file.strEncoding) && !
MCD_STRISEMPTY(strDoc) ) + { + file.strEncoding = GetDeclaredEncoding( strDoc ); + if ( MCD_STRISEMPTY(file.strEncoding) && strDoc[0] == '<' ) + file.strEncoding = MCD_T("UTF-8"); + if ( ! MCD_STRISEMPTY(file.strEncoding) ) + strCombinedIOResult += file.strEncoding + MCD_T(" "); + } + if ( pstrEncoding ) + { + MCD_STR strEncoding = *pstrEncoding; + if ( file.nFileCharUnitSize == 1 && ! MCD_STRISEMPTY(strEncoding) && file.strEncoding != strEncoding ) + { + file.strEncoding = strEncoding; // override + strCombinedIOResult += MCD_T("forced "); + strCombinedIOResult += file.strEncoding + MCD_T(" "); + } + } + bSuccess = x_WriteText( strDoc, file ); + x_Close( file ); + strCombinedIOResult += file.strIOResult; + // Hand the file's flags back only when the caller asked for them + if ( pnDocFlags ) + *pnDocFlags = file.nDocFlags; + } + // The result text is reported whether or not the write succeeded + if ( pstrError ) + *pstrError = strCombinedIOResult; + return bSuccess; +} + +bool CMarkup::FindElem( MCD_CSTR szName ) +{ + // Change current position only if found + // Returns true after moving to the element x_FindElem reports for szName, + // searching from m_iPos under m_iPosParent; the child position is reset to 0 + // + if ( m_aPos.GetSize() ) + { + int iPos = x_FindElem( m_iPosParent, m_iPos, szName ); + if ( iPos ) + { + // Assign new position + x_SetPos( m_aPos[iPos].iElemParent, iPos, 0 ); + return true; + } + } + return false; +} + +bool CMarkup::FindChildElem( MCD_CSTR szName ) +{ + // Change current child position only if found + // Returns true after setting the child position to the match x_FindElem reports + // + // Shorthand: call this with no current main position + // means find child under root element + if ( ! m_iPos ) + FindElem(); + + int iPosChild = x_FindElem( m_iPos, m_iPosChild, szName ); + if ( iPosChild ) + { + // Assign new position + int iPos = m_aPos[iPosChild].iElemParent; + x_SetPos( m_aPos[iPos].iElemParent, iPos, iPosChild ); + return true; + } + + return false; +} + +MCD_STR CMarkup::EscapeText( MCD_CSTR szText, int nFlags ) +{ + // Convert text as seen outside XML document to XML friendly + // replacing special characters with ampersand escape codes + // E.g.
convert "6>7" to "6&gt;7" + // + // &lt; less than + // &amp; ampersand + // &gt; greater than + // + // and for attributes: + // + // &apos; apostrophe or single quote + // &quot; double quote + // + // nFlags: MNF_ESCAPEQUOTES also escapes ' and "; MNF_WITHREFS copies an ampersand + // through unchanged when it already starts an entity reference ending in ; + // + // apReplace is indexed by the position of the matched character in pFind, + // so the two must list the characters in the same order + static MCD_PCSZ apReplace[] = { MCD_T("&lt;"),MCD_T("&amp;"),MCD_T("&gt;"),MCD_T("&apos;"),MCD_T("&quot;") }; + MCD_PCSZ pFind = (nFlags&MNF_ESCAPEQUOTES)?MCD_T("<&>\'\""):MCD_T("<&>"); + MCD_STR strText; + MCD_PCSZ pSource = szText; + int nDestSize = MCD_PSZLEN(pSource); + nDestSize += nDestSize / 10 + 7; + MCD_BLDRESERVE(strText,nDestSize); + MCD_CHAR cSource = *pSource; + MCD_PCSZ pFound; + int nCharLen; + while ( cSource ) + { + MCD_BLDCHECK(strText,nDestSize,6); + if ( (pFound=MCD_PSZCHR(pFind,cSource)) != NULL ) + { + bool bIgnoreAmpersand = false; + if ( (nFlags&MNF_WITHREFS) && *pFound == '&' ) + { + // Do not replace ampersand if it is start of any entity reference + // &[#_:A-Za-zU][_:-.A-Za-z0-9U]*; where U is > 0x7f + MCD_PCSZ pCheckEntity = pSource; + ++pCheckEntity; + MCD_CHAR c = *pCheckEntity; + if ( (c>='A'&&c<='Z') || (c>='a'&&c<='z') + || c=='#' || c=='_' || c==':' || ((unsigned int)c)>0x7f ) + { + while ( 1 ) + { + pCheckEntity += MCD_CLEN( pCheckEntity ); + c = *pCheckEntity; + if ( c == ';' ) + { + // Copy the whole reference through and leave pSource on its semicolon + int nEntityLen = (int)(pCheckEntity - pSource) + 1; + MCD_BLDAPPENDN(strText,pSource,nEntityLen); + pSource = pCheckEntity; + bIgnoreAmpersand = true; + } + else if ( (c>='A'&&c<='Z') || (c>='a'&&c<='z') || (c>='0'&&c<='9') + || c=='_' || c==':' || c=='-' || c=='.' || ((unsigned int)c)>0x7f ) + continue; + break; + } + } + } + if ( !
bIgnoreAmpersand ) + { + pFound = apReplace[pFound-pFind]; + MCD_BLDAPPEND(strText,pFound); + } + ++pSource; // ASCII, so 1 byte + } + else + { + nCharLen = MCD_CLEN( pSource ); + MCD_BLDAPPENDN(strText,pSource,nCharLen); + pSource += nCharLen; + } + cSource = *pSource; + } + + MCD_BLDRELEASE(strText); + return strText; +} + +// Predefined character entities +// By default UnescapeText will decode standard HTML entities as well as the 5 in XML +// To unescape only the 5 standard XML entities, use this short table instead: +// MCD_PCSZ PredefEntityTable[4] = +// { MCD_T("20060lt"),MCD_T("40034quot"),MCD_T("30038amp"),MCD_T("20062gt40039apos") }; +// +// This is a precompiled ASCII hash table for speed and minimum memory requirement +// Each entry consists of a 1 digit code name length, 4 digit code point, and the code name +// Each table slot can have multiple entries, table size 130 was chosen for even distribution +// +MCD_PCSZ PredefEntityTable[130] = +{ + MCD_T("60216oslash60217ugrave60248oslash60249ugrave"), + MCD_T("50937omega60221yacute58968lceil50969omega60253yacute"), + MCD_T("50916delta50206icirc50948delta50238icirc68472weierp"),MCD_T("40185sup1"), + MCD_T("68970lfloor40178sup2"), + MCD_T("50922kappa60164curren50954kappa58212mdash40179sup3"), + MCD_T("59830diams58211ndash"),MCD_T("68855otimes58969rceil"), + MCD_T("50338oelig50212ocirc50244ocirc50339oelig58482trade"), + MCD_T("50197aring50931sigma50229aring50963sigma"), + MCD_T("50180acute68971rfloor50732tilde"),MCD_T("68249lsaquo"), + MCD_T("58734infin68201thinsp"),MCD_T("50161iexcl"), + MCD_T("50920theta50219ucirc50952theta50251ucirc"),MCD_T("58254oline"), + MCD_T("58260frasl68727lowast"),MCD_T("59827clubs60191iquest68250rsaquo"), + MCD_T("58629crarr50181micro"),MCD_T("58222bdquo"),MCD_T(""), + MCD_T("58243prime60177plusmn58242prime"),MCD_T("40914beta40946beta"),MCD_T(""), + MCD_T(""),MCD_T(""),MCD_T("50171laquo50215times"),MCD_T("40710circ"), + MCD_T("49001lang"),MCD_T("58220ldquo40175macr"), +
MCD_T("40182para50163pound48476real"),MCD_T(""),MCD_T("58713notin50187raquo"), + MCD_T("48773cong50223szlig50978upsih"), + MCD_T("58776asymp58801equiv49002rang58218sbquo"), + MCD_T("50222thorn48659darr48595darr40402fnof58221rdquo50254thorn"), + MCD_T("40162cent58722minus"),MCD_T("58707exist40170ordf"),MCD_T(""), + MCD_T("40921iota58709empty48660harr48596harr40953iota"),MCD_T(""), + MCD_T("40196auml40228auml48226bull40167sect48838sube"),MCD_T(""), + MCD_T("48656larr48592larr58853oplus"),MCD_T("30176deg58216lsquo40186ordm"), + MCD_T("40203euml40039apos40235euml48712isin40160nbsp"), + MCD_T("40918zeta40950zeta"),MCD_T("38743and48195emsp48719prod"), + MCD_T("30935chi38745cap30967chi48194ensp"), + MCD_T("40207iuml40239iuml48706part48869perp48658rarr48594rarr"), + MCD_T("38736ang48836nsub58217rsquo"),MCD_T(""), + MCD_T("48901sdot48657uarr48593uarr"),MCD_T("40169copy48364euro"), + MCD_T("30919eta30951eta"),MCD_T("40214ouml40246ouml48839supe"),MCD_T(""), + MCD_T(""),MCD_T("30038amp30174reg"),MCD_T("48733prop"),MCD_T(""), + MCD_T("30208eth30934phi40220uuml30240eth30966phi40252uuml"),MCD_T(""),MCD_T(""), + MCD_T(""),MCD_T("40376yuml40255yuml"),MCD_T(""),MCD_T("40034quot48204zwnj"), + MCD_T("38746cup68756there4"),MCD_T("30929rho30961rho38764sim"), + MCD_T("30932tau38834sub30964tau"),MCD_T("38747int38206lrm38207rlm"), + MCD_T("30936psi30968psi30165yen"),MCD_T(""),MCD_T("28805ge30168uml"), + MCD_T("30982piv"),MCD_T(""),MCD_T("30172not"),MCD_T(""),MCD_T("28804le"), + MCD_T("30173shy"),MCD_T("39674loz28800ne38721sum"),MCD_T(""),MCD_T(""), + MCD_T("38835sup"),MCD_T("28715ni"),MCD_T(""),MCD_T("20928pi20960pi38205zwj"), + MCD_T(""),MCD_T("60923lambda20062gt60955lambda"),MCD_T(""),MCD_T(""), + MCD_T("60199ccedil60231ccedil"),MCD_T(""),MCD_T("20060lt"), + MCD_T("20926xi28744or20958xi"),MCD_T("20924mu20956mu"),MCD_T("20925nu20957nu"), + MCD_T("68225dagger68224dagger"),MCD_T("80977thetasym"),MCD_T(""),MCD_T(""), + MCD_T(""),MCD_T("78501alefsym"),MCD_T(""),MCD_T(""),MCD_T(""), + 
MCD_T("60193aacute60195atilde60225aacute60227atilde"),MCD_T(""), + MCD_T("70927omicron60247divide70959omicron"),MCD_T("60192agrave60224agrave"), + MCD_T("60201eacute60233eacute60962sigmaf"),MCD_T("70917epsilon70949epsilon"), + MCD_T(""),MCD_T("60200egrave60232egrave"),MCD_T("60205iacute60237iacute"), + MCD_T(""),MCD_T(""),MCD_T("60204igrave68230hellip60236igrave"), + MCD_T("60166brvbar"), + MCD_T("60209ntilde68704forall58711nabla60241ntilde69824spades"), + MCD_T("60211oacute60213otilde60189frac1260183middot60243oacute60245otilde"), + MCD_T(""),MCD_T("50184cedil60188frac14"), + MCD_T("50198aelig50194acirc60210ograve50226acirc50230aelig60242ograve"), + MCD_T("50915gamma60190frac3450947gamma58465image58730radic"), + MCD_T("60352scaron60353scaron"),MCD_T("60218uacute69829hearts60250uacute"), + MCD_T("50913alpha50202ecirc70933upsilon50945alpha50234ecirc70965upsilon"), + MCD_T("68240permil") +}; + +MCD_STR CMarkup::UnescapeText( MCD_CSTR szText, int nTextLength /*=-1*/ ) +{ + // Convert XML friendly text to text as seen outside XML document + // ampersand escape codes replaced with special characters e.g. convert "6>7" to "6>7" + // ampersand numeric codes replaced with character e.g. convert < to < + // Conveniently the result is always the same or shorter in byte length + // + MCD_STR strText; + MCD_PCSZ pSource = szText; + if ( nTextLength == -1 ) + nTextLength = MCD_PSZLEN(szText); + MCD_BLDRESERVE(strText,nTextLength); + MCD_CHAR szCodeName[10]; + int nCharLen; + int nChar = 0; + while ( nChar < nTextLength ) + { + if ( pSource[nChar] == '&' ) + { + // Get corresponding unicode code point + int nUnicode = 0; + + // Look for terminating semi-colon within 9 ASCII characters + int nCodeLen = 0; + MCD_CHAR cCodeChar = pSource[nChar+1]; + while ( nCodeLen < 9 && ((unsigned int)cCodeChar) < 128 && cCodeChar != ';' ) + { + if ( cCodeChar >= 'A' && cCodeChar <= 'Z') // upper case? 
+ cCodeChar += ('a' - 'A'); // make lower case + szCodeName[nCodeLen] = cCodeChar; + ++nCodeLen; + cCodeChar = pSource[nChar+1+nCodeLen]; + } + if ( cCodeChar == ';' ) // found semi-colon? + { + // Decode szCodeName + szCodeName[nCodeLen] = '\0'; + if ( *szCodeName == '#' ) // numeric character reference? + { + // Is it a hex number? + int nBase = 10; // decimal + int nNumberOffset = 1; // after # + if ( szCodeName[1] == 'x' ) + { + nNumberOffset = 2; // after #x + nBase = 16; // hex + } + nUnicode = MCD_PSZTOL( &szCodeName[nNumberOffset], NULL, nBase ); + } + else // does not start with # + { + // Look for matching code name in PredefEntityTable + MCD_PCSZ pEntry = PredefEntityTable[x_Hash(szCodeName,sizeof(PredefEntityTable)/sizeof(MCD_PCSZ))]; + while ( *pEntry ) + { + // e.g. entry: 40039apos means length 4, code point 0039, code name apos + int nEntryLen = (*pEntry - '0'); + ++pEntry; + MCD_PCSZ pCodePoint = pEntry; + pEntry += 4; + if ( nEntryLen == nCodeLen && MCD_PSZNCMP(szCodeName,pEntry,nEntryLen) == 0 ) + { + // Convert digits to integer up to code name which always starts with alpha + nUnicode = MCD_PSZTOL( pCodePoint, NULL, 10 ); + break; + } + pEntry += nEntryLen; + } + } + } + + // If a code point found, encode it into text + if ( nUnicode ) + { + MCD_CHAR szChar[5]; + nCharLen = 0; +#if defined(MARKUP_WCHAR) // WCHAR +#if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 + szChar[0] = (MCD_CHAR)nUnicode; + nCharLen = 1; +#else // sizeof(wchar_t) == 2 + EncodeCharUTF16( nUnicode, (unsigned short*)szChar, nCharLen ); +#endif +#elif defined(MARKUP_MBCS) // MBCS/double byte +#if defined(MARKUP_WINCONV) + int nUsedDefaultChar = 0; + wchar_t wszUTF16[2]; + EncodeCharUTF16( nUnicode, (unsigned short*)wszUTF16, nCharLen ); + nCharLen = WideCharToMultiByte( CP_ACP, 0, wszUTF16, nCharLen, szChar, 5, NULL, &nUsedDefaultChar ); + if ( nUsedDefaultChar || nCharLen <= 0 ) + nUnicode = 0; +#else // not WINCONV + wchar_t wcUnicode = (wchar_t)nUnicode; + nCharLen = 
wctomb( szChar, wcUnicode ); + if ( nCharLen <= 0 ) + nUnicode = 0; +#endif // not WINCONV +#else // not WCHAR and not MBCS/double byte + EncodeCharUTF8( nUnicode, szChar, nCharLen ); +#endif // not WCHAR and not MBCS/double byte + // Increment index past ampersand semi-colon + if ( nUnicode ) // must check since MBCS case can clear it + { + MCD_BLDAPPENDN(strText,szChar,nCharLen); + nChar += nCodeLen + 2; + } + } + if ( ! nUnicode ) + { + // If the code is not converted, leave it as is + MCD_BLDAPPEND1(strText,'&'); + ++nChar; + } + } + else // not & + { + nCharLen = MCD_CLEN(&pSource[nChar]); + MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen); + nChar += nCharLen; + } + } + MCD_BLDRELEASE(strText); + return strText; +} + +bool CMarkup::DetectUTF8( const char* pText, int nTextLen, int* pnNonASCII/*=NULL*/ ) +{ + // return true if ASCII or all non-ASCII byte sequences are valid UTF-8 pattern: + // ASCII 0xxxxxxx + // 2-byte 110xxxxx 10xxxxxx + // 3-byte 1110xxxx 10xxxxxx 10xxxxxx + // 4-byte 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // *pnNonASCII is set (if pnNonASCII is not NULL) to the number of non-ASCII UTF-8 sequences + // or if an invalid UTF-8 sequence is found, to 1 + the valid sequences up to the invalid sequence + int nUChar; + if ( pnNonASCII ) + *pnNonASCII = 0; + const char* pTextEnd = pText + nTextLen; + while ( *pText && pText != pTextEnd ) + { + if ( (unsigned char)(*pText) & 0x80 ) + { + if ( pnNonASCII ) + ++(*pnNonASCII); + nUChar = DecodeCharUTF8( pText, pTextEnd ); + if ( nUChar == -1 ) + return false; + } + else + ++pText; + } + return true; +} + +int CMarkup::DecodeCharUTF8( const char*& pszUTF8, const char* pszUTF8End/*=NULL*/ ) +{ + // Return Unicode code point and increment pszUTF8 past 1-4 bytes + // pszUTF8End can be NULL if pszUTF8 is null terminated + int nUChar = (unsigned char)*pszUTF8; + ++pszUTF8; + if ( nUChar & 0x80 ) + { + int nExtraChars; + if ( ! (nUChar & 0x20) ) + { + nExtraChars = 1; + nUChar &= 0x1f; + } + else if ( ! 
(nUChar & 0x10) ) + { + nExtraChars = 2; + nUChar &= 0x0f; + } + else if ( ! (nUChar & 0x08) ) + { + nExtraChars = 3; + nUChar &= 0x07; + } + else + return -1; + while ( nExtraChars-- ) + { + if ( pszUTF8 == pszUTF8End || ! (*pszUTF8 & 0x80) ) + return -1; + nUChar = nUChar<<6; + nUChar |= *pszUTF8 & 0x3f; + ++pszUTF8; + } + } + return nUChar; +} + +void CMarkup::EncodeCharUTF16( int nUChar, unsigned short* pwszUTF16, int& nUTF16Len ) +{ + // Write UTF-16 sequence to pwszUTF16 for Unicode code point nUChar and update nUTF16Len + // Be sure pwszUTF16 has room for up to 2 wide chars + if ( nUChar & ~0xffff ) + { + if ( pwszUTF16 ) + { + // Surrogate pair + nUChar -= 0x10000; + pwszUTF16[nUTF16Len++] = (wchar_t)(((nUChar>>10) & 0x3ff) | 0xd800); // W1 + pwszUTF16[nUTF16Len++] = (wchar_t)((nUChar & 0x3ff) | 0xdc00); // W2 + } + else + nUTF16Len += 2; + } + else + { + if ( pwszUTF16 ) + pwszUTF16[nUTF16Len++] = (wchar_t)nUChar; + else + ++nUTF16Len; + } +} + +int CMarkup::DecodeCharUTF16( const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/ ) +{ + // Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogrates) UTF-16 code points + // pszUTF16End can be NULL if pszUTF16 is zero terminated + int nUChar = *pwszUTF16; + ++pwszUTF16; + if ( (nUChar & ~0x000007ff) == 0xd800 ) // W1 + { + if ( pwszUTF16 == pszUTF16End || ! (*pwszUTF16) ) // W2 + return -1; // incorrect UTF-16 + nUChar = (((nUChar & 0x3ff) << 10) | (*pwszUTF16 & 0x3ff)) + 0x10000; + ++pwszUTF16; + } + return nUChar; +} + +void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len ) +{ + // Write UTF-8 sequence to pszUTF8 for Unicode code point nUChar and update nUTF8Len + // Be sure pszUTF8 has room for up to 4 bytes + if ( ! (nUChar & ~0x0000007f) ) // < 0x80 + { + if ( pszUTF8 ) + pszUTF8[nUTF8Len++] = (char)nUChar; + else + ++nUTF8Len; + } + else if ( ! 
(nUChar & ~0x000007ff) ) // < 0x800 + { + if ( pszUTF8 ) + { + pszUTF8[nUTF8Len++] = (char)(((nUChar&0x7c0)>>6)|0xc0); + pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); + } + else + nUTF8Len += 2; + } + else if ( ! (nUChar & ~0x0000ffff) ) // < 0x10000 + { + if ( pszUTF8 ) + { + pszUTF8[nUTF8Len++] = (char)(((nUChar&0xf000)>>12)|0xe0); + pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80); + pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); + } + else + nUTF8Len += 3; + } + else // < 0x110000 + { + if ( pszUTF8 ) + { + pszUTF8[nUTF8Len++] = (char)(((nUChar&0x1c0000)>>18)|0xf0); + pszUTF8[nUTF8Len++] = (char)(((nUChar&0x3f000)>>12)|0x80); + pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80); + pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); + } + else + nUTF8Len += 4; + } +} + +int CMarkup::UTF16To8( char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count ) +{ + // Supports the same arguments as wcstombs + // the pwszUTF16 source must be a NULL-terminated UTF-16 string + // if pszUTF8 is NULL, the number of bytes required is returned and nUTF8Count is ignored + // otherwise pszUTF8 is filled with the result string and NULL-terminated if nUTF8Count allows + // nUTF8Count is the byte size of pszUTF8 and must be large enough for the NULL if NULL desired + // and the number of bytes (excluding NULL) is returned + // + int nUChar, nUTF8Len = 0; + while ( *pwszUTF16 ) + { + // Decode UTF-16 + nUChar = DecodeCharUTF16( pwszUTF16, NULL ); + if ( nUChar == -1 ) + nUChar = '?'; + + // Encode UTF-8 + if ( pszUTF8 && nUTF8Len + 4 > nUTF8Count ) + { + int nUTF8LenSoFar = nUTF8Len; + EncodeCharUTF8( nUChar, NULL, nUTF8Len ); + if ( nUTF8Len > nUTF8Count ) + return nUTF8LenSoFar; + nUTF8Len = nUTF8LenSoFar; + } + EncodeCharUTF8( nUChar, pszUTF8, nUTF8Len ); + } + if ( pszUTF8 && nUTF8Len < nUTF8Count ) + pszUTF8[nUTF8Len] = 0; + return nUTF8Len; +} + +int CMarkup::UTF8To16( unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count ) +{ + // Supports the same 
arguments as mbstowcs + // the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count + // if pwszUTF16 is NULL, the number of UTF-16 chars required is returned + // nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result + // if pwszUTF16 is not NULL it is filled with the result string and it must be large enough + // result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count + // and the number of UTF-8 bytes converted is returned + // + const char* pszPosUTF8 = pszUTF8; + const char* pszUTF8End = pszUTF8 + nUTF8Count; + int nUChar, nUTF8Len = 0, nUTF16Len = 0; + while ( pszPosUTF8 != pszUTF8End ) + { + nUChar = DecodeCharUTF8( pszPosUTF8, pszUTF8End ); + if ( ! nUChar ) + { + if ( pwszUTF16 ) + pwszUTF16[nUTF16Len] = 0; + break; + } + else if ( nUChar == -1 ) + nUChar = '?'; + + // Encode UTF-16 + EncodeCharUTF16( nUChar, pwszUTF16, nUTF16Len ); + } + nUTF8Len = (int)(pszPosUTF8 - pszUTF8); + if ( ! pwszUTF16 ) + return nUTF16Len; + return nUTF8Len; +} + +#if ! 
defined(MARKUP_WCHAR) // not WCHAR +MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ ) +{ + // Converts from UTF-8 to locale ANSI charset + MCD_STR strANSI; + int nMBLen = (int)MCD_PSZLEN( pszUTF8 ); + if ( pnFailed ) + *pnFailed = 0; + if ( nMBLen ) + { + ConvertEncoding convert( MCD_T(""), MCD_T("UTF-8"), (const void*)pszUTF8, nMBLen ); + convert.nToCount = nMBLen; + MCD_CHAR* pANSIBuffer = MCD_GETBUFFER(strANSI,convert.nToCount); + nMBLen = x_ConvertEncoding( convert, (void*)pANSIBuffer ); + MCD_RELEASEBUFFER(strANSI,pANSIBuffer,nMBLen); + if ( pnFailed ) + *pnFailed = convert.nFailedChars; + } + return strANSI; +} + +MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI ) +{ + // Converts locale ANSI charset to UTF-8 + MCD_STR strUTF8; + int nMBLen = (int)MCD_PSZLEN( pszANSI ); + if ( nMBLen ) + { + ConvertEncoding convert( MCD_T("UTF-8"), MCD_T(""), (const void*)pszANSI, nMBLen ); + convert.nToCount = nMBLen * 4; + MCD_CHAR* pUTF8Buffer = MCD_GETBUFFER(strUTF8,convert.nToCount); + nMBLen = x_ConvertEncoding( convert, (void*)pUTF8Buffer ); + MCD_RELEASEBUFFER(strUTF8,pUTF8Buffer,nMBLen); + } + return strUTF8; +} +#endif // not WCHAR + +MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc ) +{ + // Extract encoding attribute from XML Declaration, or HTML meta charset + MCD_STR strEncoding; + TokenPos token( szDoc, MDF_IGNORECASE ); + NodePos node; + bool bHtml = false; + int nTypeFound = 0; + while ( nTypeFound >= 0 ) + { + nTypeFound = x_ParseNode( token, node ); + int nNext = token.nNext; + if ( nTypeFound == MNT_PROCESSING_INSTRUCTION && token.nL == 0 ) + { + token.nNext = node.nStart + 2; // after + if ( x_FindAttrib( token, MCD_T("encoding") ) ) + strEncoding = x_GetToken( token ); + break; + } + } + else if ( nTypeFound == 0 ) // end tag + { + // Check for end of HTML head + token.nNext = node.nStart + 2; // after + int nAttribOffset = node.nStart + 1; + token.nNext = nAttribOffset; + if ( x_FindAttrib(token,MCD_T("http-equiv")) && 
token.Match(MCD_T("Content-Type")) ) + { + token.nNext = nAttribOffset; + if ( x_FindAttrib(token,MCD_T("content")) ) + { + int nContentEndOffset = token.nNext; + token.nNext = token.nL; + while ( token.nNext < nContentEndOffset && x_FindName(token) ) + { + if ( token.Match(MCD_T("charset")) && x_FindName(token) && token.Match(MCD_T("=")) ) + { + x_FindName( token ); + strEncoding = x_GetToken( token ); + break; + } + } + } + break; + } + } + } + token.nNext = nNext; + } + return strEncoding; +} + + +int CMarkup::FindNode( int nType ) +{ + // Change current node position only if a node is found + // If nType is 0 find any node, otherwise find node of type nType + // Return type of node or 0 if not found + // If found node is an element, change m_iPos + + // Determine where in document to start scanning for node + int nTypeFound = 0; + int nNodeOffset = m_nNodeOffset; + if ( m_nNodeType > 1 ) + { + // By-pass current node + nNodeOffset += m_nNodeLength; + } + else + { + // Set position to begin looking for node + nNodeOffset = 0; // default to start of document + if ( m_iPos ) + { + // After element + nNodeOffset = m_aPos[m_iPos].StartAfter(); + } + else if ( m_iPosParent ) + { + // Immediately after start tag of parent + if ( m_aPos[m_iPosParent].IsEmptyElement() ) + return 0; + else + nNodeOffset = m_aPos[m_iPosParent].StartContent(); + } + } + + // Get nodes until we find what we're looking for + int iPosNew = m_iPos; + TokenPos token( m_strDoc, m_nDocFlags ); + NodePos node; + token.nNext = nNodeOffset; + do + { + nNodeOffset = token.nNext; + nTypeFound = x_ParseNode( token, node ); + if ( nTypeFound == 0 ) + { + // Check if we have reached the end of the parent element + // Otherwise it is a lone end tag + if ( m_iPosParent && nNodeOffset == m_aPos[m_iPosParent].StartContent() + + m_aPos[m_iPosParent].ContentLen() ) + return 0; + nTypeFound = MNT_LONE_END_TAG; + } + else if ( nTypeFound < 0 ) + { + if ( nTypeFound == -2 ) + return 0; + // -1 is node error + 
nTypeFound = MNT_NODE_ERROR; + } + else if ( nTypeFound == MNT_ELEMENT ) + { + if ( iPosNew ) + iPosNew = m_aPos[iPosNew].iElemNext; + else + iPosNew = m_aPos[m_iPosParent].iElemChild; + if ( ! iPosNew ) + return 0; + if ( ! nType || (nType & nTypeFound) ) + { + // Found element node, move position to this element + x_SetPos( m_iPosParent, iPosNew, 0 ); + return m_nNodeType; + } + token.nNext = m_aPos[iPosNew].StartAfter(); + } + } + while ( nType && ! (nType & nTypeFound) ); + + m_iPos = iPosNew; + m_iPosChild = 0; + m_nNodeOffset = nNodeOffset; + m_nNodeLength = token.nNext - nNodeOffset; + m_nNodeType = nTypeFound; + MARKUP_SETDEBUGSTATE; + return m_nNodeType; +} + +bool CMarkup::RemoveNode() +{ + if ( m_iPos || m_nNodeLength ) + { + x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength ); + m_iPosChild = 0; + MARKUP_SETDEBUGSTATE; + return true; + } + return false; +} + +MCD_STR CMarkup::GetTagName() const +{ + // Return the tag name at the current main position + MCD_STR strTagName; + + // This method is primarily for elements, however + // it does return something for certain other nodes + if ( m_nNodeLength ) + { + switch ( m_nNodeType ) + { + case MNT_PROCESSING_INSTRUCTION: + case MNT_LONE_END_TAG: + { + // nMapSize); + SavedPos* pSavedPos = pMap->pTable[nSlot]; + int nOffset = 0; + if ( ! 
pSavedPos ) + { + pSavedPos = new SavedPos[2]; + pSavedPos[1].nSavedPosFlags = SavedPos::SPM_LAST; + pMap->pTable[nSlot] = pSavedPos; + } + else + { + while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED ) + { + if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName ) + break; + if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) + { + int nNewSize = (nOffset + 6) * 2; + SavedPos* pNewSavedPos = new SavedPos[nNewSize]; + for ( int nCopy=0; nCopy<=nOffset; ++nCopy ) + pNewSavedPos[nCopy] = pSavedPos[nCopy]; + pNewSavedPos[nOffset].nSavedPosFlags ^= SavedPos::SPM_LAST; + pNewSavedPos[nNewSize-1].nSavedPosFlags = SavedPos::SPM_LAST; + delete [] pSavedPos; + pSavedPos = pNewSavedPos; + pMap->pTable[nSlot] = pSavedPos; + ++nOffset; + break; + } + ++nOffset; + } + } + if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) + savedpos.nSavedPosFlags |= SavedPos::SPM_LAST; + pSavedPos[nOffset] = savedpos; + + /* + // To review hash table balance, uncomment and watch strBalance + MCD_STR strBalance, strSlot; + for ( nSlot=0; nSlot < pMap->nMapSize; ++nSlot ) + { + pSavedPos = pMap->pTable[nSlot]; + int nCount = 0; + while ( pSavedPos && pSavedPos->nSavedPosFlags & SavedPos::SPM_USED ) + { + ++nCount; + if ( pSavedPos->nSavedPosFlags & SavedPos::SPM_LAST ) + break; + ++pSavedPos; + } + strSlot.Format( MCD_T("%d "), nCount ); + strBalance += strSlot; + } + */ + return true; + } + return false; +} + +bool CMarkup::RestorePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ ) +{ + // Restore element position if found in saved position map + if ( szPosName ) + { + SavedPosMap* pMap; + x_GetMap( pMap, nMap ); + int nSlot = x_Hash( szPosName, pMap->nMapSize ); + SavedPos* pSavedPos = pMap->pTable[nSlot]; + if ( pSavedPos ) + { + int nOffset = 0; + while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED ) + { + if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName ) + { + int i = pSavedPos[nOffset].iPos; + if ( pSavedPos[nOffset].nSavedPosFlags & 
SavedPos::SPM_CHILD ) + x_SetPos( m_aPos[m_aPos[i].iElemParent].iElemParent, m_aPos[i].iElemParent, i ); + else if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_MAIN ) + x_SetPos( m_aPos[i].iElemParent, i, 0 ); + else + x_SetPos( i, 0, 0 ); + return true; + } + if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) + break; + ++nOffset; + } + } + } + return false; +} + +bool CMarkup::SetMapSize( int nSize, int nMap /*=0*/ ) +{ + // Set saved position map hash table size before using it + // Returns false if map already exists + // Some prime numbers: 53, 101, 211, 503, 1009, 2003, 10007, 20011, 50021, 100003, 200003, 500009 + SavedPosMap* pNewMap; + return x_GetMap( pNewMap, nMap, nSize ); +} + +bool CMarkup::RemoveElem() +{ + // Remove current main position element + if ( m_iPos && m_nNodeType == MNT_ELEMENT ) + { + int iPos = x_RemoveElem( m_iPos ); + x_SetPos( m_iPosParent, iPos, 0 ); + return true; + } + return false; +} + +bool CMarkup::RemoveChildElem() +{ + // Remove current child position element + if ( m_iPosChild ) + { + int iPosChild = x_RemoveElem( m_iPosChild ); + x_SetPos( m_iPosParent, m_iPos, iPosChild ); + return true; + } + return false; +} + + +////////////////////////////////////////////////////////////////////// +// Private Methods +////////////////////////////////////////////////////////////////////// + +MCD_STR CMarkup::x_GetLastError() +{ + // strerror is has difficulties cross-platform + // VC++ leaves MCD_STRERROR undefined and uses FormatMessage + // Non-VC++ use strerror (even for MARKUP_WCHAR and convert) + // additional notes: + // _WIN32_WCE (Windows CE) has no strerror (Embedded VC++ uses FormatMessage) + // _MSC_VER >= 1310 (VC++ 2003/7.1) has _wcserror (but not used) + // + const int nErrorBufferSize = 100; + MCD_CHAR szError[nErrorBufferSize+1]; +#if defined(MCD_STRERROR) // C error routine +#if defined(MARKUP_WCHAR) + char szMBError[nErrorBufferSize+1]; + strncpy( szMBError, MCD_STRERROR, nErrorBufferSize ); + 
szMBError[nErrorBufferSize] = '\0'; + ConvertEncoding convert( MCD_ENC, MCD_T(""), (const void*)szMBError, strlen(szMBError) ); + convert.nToCount = nErrorBufferSize; + int nWideLen = x_ConvertEncoding( convert, (void*)szError ); + szError[nWideLen] = '\0'; +#else + MCD_PSZNCPY( szError, MCD_STRERROR, nErrorBufferSize ); + szError[nErrorBufferSize] = '\0'; +#endif +#else // no C error routine, use Windows API + if ( ::FormatMessage(0x1200,0,::GetLastError(),0,szError,nErrorBufferSize,0) < 1 ) + szError[0] = '\0'; +#endif // no C error routine + MCD_STR strError = szError; + for ( int nChar=0; nChar>1); // Grow By: multiply size by 1.5 + if ( m_aPos.GetSize() < nNewSize ) + { + // Grow By: new size can be at most one more complete segment + int nSeg = (m_aPos.GetSize()?m_aPos.GetSize()-1:0) >> m_aPos.PA_SEGBITS; + int nNewSeg = (nNewSize-1) >> m_aPos.PA_SEGBITS; + if ( nNewSeg > nSeg + 1 ) + { + nNewSeg = nSeg + 1; + nNewSize = (nNewSeg+1) << m_aPos.PA_SEGBITS; + } + + // Allocate array of segments + if ( m_aPos.nSegs <= nNewSeg ) + { + int nNewSegments = 4 + nNewSeg * 2; + char* pNewSegments = new char[nNewSegments*sizeof(char*)]; + if ( m_aPos.SegsUsed() ) + memcpy( pNewSegments, m_aPos.pSegs, m_aPos.SegsUsed()*sizeof(char*) ); + if ( m_aPos.pSegs ) + delete[] (char*)m_aPos.pSegs; + m_aPos.pSegs = (ElemPos**)pNewSegments; + m_aPos.nSegs = nNewSegments; + } + + // Calculate segment sizes + int nSegSize = m_aPos.GetSize() - (nSeg << m_aPos.PA_SEGBITS); + int nNewSegSize = nNewSize - (nNewSeg << m_aPos.PA_SEGBITS); + + // Complete first segment + int nFullSegSize = 1 << m_aPos.PA_SEGBITS; + if ( nSeg < nNewSeg && nSegSize < nFullSegSize ) + { + char* pNewFirstSeg = new char[ nFullSegSize * sizeof(ElemPos) ]; + if ( nSegSize ) + { + // Reallocate + memcpy( pNewFirstSeg, m_aPos.pSegs[nSeg], nSegSize * sizeof(ElemPos) ); + delete[] (char*)m_aPos.pSegs[nSeg]; + } + m_aPos.pSegs[nSeg] = (ElemPos*)pNewFirstSeg; + } + + // New segment + char* pNewSeg = new char[ nNewSegSize 
* sizeof(ElemPos) ]; + if ( nNewSeg == nSeg && nSegSize ) + { + // Reallocate + memcpy( pNewSeg, m_aPos.pSegs[nSeg], nSegSize * sizeof(ElemPos) ); + delete[] (char*)m_aPos.pSegs[nSeg]; + } + m_aPos.pSegs[nNewSeg] = (ElemPos*)pNewSeg; + m_aPos.nSize = nNewSize; + } + return true; +} + +bool CMarkup::x_ParseDoc() +{ + // Preserve pre-parse result + MCD_STR strResult = m_strError; + + // Reset indexes + ResetPos(); + m_SavedPosMapArray.RemoveAll(); + + // Starting size of position array: 1 element per 64 bytes of document + // Tight fit when parsing small doc, only 0 to 2 reallocs when parsing large doc + // Start at 8 when creating new document + int nDocLen = MCD_STRLENGTH(m_strDoc); + m_iPosFree = 1; + x_AllocPosArray( nDocLen / 64 + 8 ); + m_iPosDeleted = 0; + + // Parse document + m_aPos[0].ClearVirtualParent(); + if ( nDocLen ) + { + TokenPos token( m_strDoc, m_nDocFlags ); + int iPos = x_ParseElem( 0, token ); + m_aPos[0].nLength = nDocLen; + if ( iPos > 0 ) + { + m_aPos[0].iElemChild = iPos; + if ( m_aPos[iPos].iElemNext ) + m_strError = MCD_T("Root element has sibling"); + } + else + m_strError = MCD_T("No root element"); + } + else + m_strError = MCD_T("Empty document"); + + ResetPos(); + + // Combine preserved result with parse error + if ( ! 
MCD_STRISEMPTY(strResult) ) + { + if ( MCD_STRISEMPTY(m_strError) ) + m_strError = strResult; + else + m_strError = strResult + MCD_T(" ") + m_strError; + } + + return IsWellFormed(); +}; + +int CMarkup::x_ParseElem( int iPosParent, TokenPos& token ) +{ + // This is either called by x_ParseDoc or x_AddSubDoc or x_SetElemContent + // Returns index of the first element encountered or zero if no elements + // + int iElemRoot = 0; + int iPos = iPosParent; + int iVirtualParent = iPosParent; + int nRootDepth = m_aPos[iPos].Level(); + token.nNext = 0; + MCD_STRCLEAR(m_strError); + + // Loop through the nodes of the document + NodeStack aNodes; + aNodes.Add(); + int nDepth = 0; + int nMatchDepth; + int iPosChild; + int iPosMatch; + int nTypeFound = 0; + ElemPos* pElem; + int iElemFirst, iElemLast; + while ( 1 ) + { + nTypeFound = x_ParseNode( token, aNodes.Top() ); + nMatchDepth = 0; + if ( nTypeFound == MNT_ELEMENT ) // start tag + { + iPos = x_GetFreePos(); + if ( ! iElemRoot ) + iElemRoot = iPos; + pElem = &m_aPos[iPos]; + pElem->iElemParent = iPosParent; + pElem->iElemNext = 0; + if ( m_aPos[iPosParent].iElemChild ) + { + iElemFirst = m_aPos[iPosParent].iElemChild; + iElemLast = m_aPos[iElemFirst].iElemPrev; + m_aPos[iElemLast].iElemNext = iPos; + pElem->iElemPrev = iElemLast; + m_aPos[iElemFirst].iElemPrev = iPos; + pElem->nFlags = 0; + } + else + { + m_aPos[iPosParent].iElemChild = iPos; + pElem->iElemPrev = iPos; + pElem->nFlags = MNF_FIRST; + } + pElem->SetLevel( nRootDepth + nDepth ); + pElem->iElemChild = 0; + pElem->nStart = aNodes.Top().nStart; + pElem->SetStartTagLen( aNodes.Top().nLength ); + if ( aNodes.Top().nNodeFlags & MNF_EMPTY ) + { + iPos = iPosParent; + pElem->SetEndTagLen( 0 ); + pElem->nLength = aNodes.Top().nLength; + } + else + { + iPosParent = iPos; + ++nDepth; + aNodes.Add(); + } + } + else if ( nTypeFound == 0 ) // end tag + { + nMatchDepth = nDepth; + iPosMatch = iPos; + while ( nMatchDepth && ! 
token.Match(aNodes.At(nMatchDepth-1).strMeta) ) + { + /* + // Auto-switch case sensitivity + if ( ! (token.nTokenFlags & MDF_IGNORECASE ) ) + { + token.nTokenFlags |= MDF_IGNORECASE; + if ( token.Match(aNodes.At(nMatchDepth-1).strMeta) ) + break; + token.nTokenFlags |= MDF_IGNORECASE; + } + */ + --nMatchDepth; + iPosMatch = m_aPos[iPosMatch].iElemParent; + } + if ( nMatchDepth == 0 ) + { + // Not matched at all, it is a lone end tag, a non-element node + m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED; + m_aPos[iPos].nFlags |= MNF_ILLDATA; + if ( MCD_STRISEMPTY(m_strError) ) + { + m_strError = MCD_T("No start tag for end tag '"); + m_strError += x_GetToken(token); + m_strError += MCD_T("' at offset "); + m_strError += x_IntToStr( aNodes.Top().nStart ); + } + } + else + { + pElem = &m_aPos[iPosMatch]; + pElem->nLength = aNodes.Top().nStart - pElem->nStart + aNodes.Top().nLength; + pElem->SetEndTagLen( aNodes.Top().nLength ); + } + } + else if ( nTypeFound == -1 ) + { + m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED; + m_aPos[iPos].nFlags |= MNF_ILLDATA; + if ( MCD_STRISEMPTY(m_strError) ) + m_strError = aNodes.Top().strMeta; + } + + // Matched end tag, or end of document + if ( nMatchDepth || nTypeFound == -2 ) + { + if ( nDepth > nMatchDepth ) + m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED; + + // Process any non-ended elements + while ( nDepth > nMatchDepth ) + { + // Element with no end tag + pElem = &m_aPos[iPos]; + iPosChild = pElem->iElemChild; + iPosParent = pElem->iElemParent; + pElem->SetEndTagLen( 0 ); + pElem->nFlags |= MNF_NONENDED; + pElem->iElemChild = 0; + pElem->nLength = pElem->StartTagLen(); + if ( pElem->nFlags & MNF_ILLDATA ) + { + pElem->nFlags ^= MNF_ILLDATA; + m_aPos[iPosParent].nFlags |= MNF_ILLDATA; + } + while ( iPosChild ) + { + m_aPos[iPosChild].iElemParent = iPosParent; + m_aPos[iPosChild].iElemPrev = iPos; + m_aPos[iPos].iElemNext = iPosChild; + iPos = iPosChild; + iPosChild = m_aPos[iPosChild].iElemNext; + } + iPos = iPosParent; + 
aNodes.Remove(); + --nDepth; + + // Error string + // if end tag did not match, top node is end tag that did not match pElem + // if end of document, any nodes below top have no end tag + if ( MCD_STRISEMPTY(m_strError) ) + { + if ( nTypeFound == 0 ) + { + m_strError = MCD_T("End tag '") + x_GetToken(token); + m_strError += MCD_T("' at offset ") + x_IntToStr( token.nL-1 ); + m_strError += MCD_T(" does not match start tag '") + aNodes.Top().strMeta; + m_strError += MCD_T("' at offset ") + x_IntToStr( pElem->nStart ); + } + else + { + m_strError = MCD_T("Element '") + aNodes.Top().strMeta; + m_strError += MCD_T("' at offset ") + x_IntToStr( aNodes.Top().nStart ); + m_strError += MCD_T(" not ended"); + } + } + } + if ( nTypeFound == -2 ) + break; + iPosParent = m_aPos[iPos].iElemParent; + iPos = iPosParent; + aNodes.Remove(); + --nDepth; + } + } + return iElemRoot; +} + +bool CMarkup::x_FindAny( MCD_PCSZ pDoc, int& nChar ) +{ + // Starting at nChar, find a non-whitespace char + // return false if no non-whitespace before end of document, nChar points to end + // otherwise return true and nChar points to non-whitespace char + while ( pDoc[nChar] && MCD_PSZCHR(MCD_T(" \t\n\r"),pDoc[nChar]) ) + ++nChar; + return pDoc[nChar] != '\0'; +} + +bool CMarkup::x_FindName( CMarkup::TokenPos& token ) +{ + // Starting at token.nNext, bypass whitespace and find the next name + // returns true on success, members of token point to token + // returns false on end of document, members point to end of document + MCD_PCSZ pDoc = token.pDoc; + int nChar = token.nNext; + + // By-pass leading whitespace + if ( ! x_FindAny(pDoc,nChar) ) + { + // No token was found before end of document + token.nL = nChar; + token.nR = nChar - 1; + token.nNext = nChar; + return false; + } + + // Go until special char or whitespace + token.nL = nChar; + while ( pDoc[nChar] && ! 
MCD_PSZCHR(MCD_T(" \t\n\r<>=\\/?!\"';"),pDoc[nChar]) ) + nChar += MCD_CLEN(&pDoc[nChar]); + + // Adjust end position if it is one special char + if ( nChar == token.nL ) + ++nChar; // it is a special char + token.nR = nChar - 1; + + // nNext points to one past last char of token + token.nNext = nChar; + return true; +} + +MCD_STR CMarkup::x_GetToken( const CMarkup::TokenPos& token ) +{ + // The token contains indexes into the document identifying a small substring + // Build the substring from those indexes and return it + if ( token.nL > token.nR ) + return MCD_T(""); + MCD_STR strToken( &token.pDoc[token.nL], token.Length() ); + return strToken; +} + +int CMarkup::x_FindElem( int iPosParent, int iPos, MCD_PCSZ pPath ) const +{ + // If pPath is NULL or empty, go to next sibling element + // Otherwise go to next sibling element with matching path + // + if ( iPos ) + iPos = m_aPos[iPos].iElemNext; + else + iPos = m_aPos[iPosParent].iElemChild; + + // Finished here if pPath not specified + if ( pPath == NULL || !pPath[0] ) + return iPos; + + // Search + TokenPos token( m_strDoc, m_nDocFlags ); + while ( iPos ) + { + // Compare tag name + token.nNext = m_aPos[iPos].nStart + 1; + x_FindName( token ); // Locate tag name + if ( token.Match(pPath) ) + return iPos; + iPos = m_aPos[iPos].iElemNext; + } + return 0; + +} + +int CMarkup::x_ParseNode( CMarkup::TokenPos& token, CMarkup::NodePos& node ) +{ + // Call this with token.nNext set to the start of the node or tag + // Upon return token.nNext points to the char after the node or tag + // + // comment + // dtd + // processing instruction + // cdata section + // element start tag + // element end tag + // + // returns the nodetype or + // 0 for end tag + // -1 for bad node + // -2 for end of document + // + enum ParseBits + { + PD_OPENTAG = 1, + PD_BANG = 2, + PD_DASH = 4, + PD_BRACKET = 8, + PD_TEXTORWS = 16, + PD_DOCTYPE = 32, + PD_INQUOTE_S = 64, + PD_INQUOTE_D = 128, + PD_EQUALS = 256, + }; + int nParseFlags = 0; + + 
MCD_PCSZ pFindEnd = NULL; + int nNodeType = -1; + int nEndLen = 0; + int nName = 0; + unsigned int cDminus1 = 0, cDminus2 = 0; + #define FINDNODETYPE(e,t) { pFindEnd=e; nEndLen=(sizeof(e)-1)/sizeof(MCD_CHAR); nNodeType=t; } + #define FINDNODETYPENAME(e,t,n) { FINDNODETYPE(e,t) nName=(int)(pDoc-token.pDoc)+n-1; } + #define FINDNODEERR(e) node.strMeta=MCD_T("Incorrect "); node.strMeta+=e; node.strMeta+=MCD_T(" at offset ")+x_IntToStr(nR) + #define FINDNODEBAD(e) { pFindEnd=MCD_T(">"); nEndLen=1; FINDNODEERR(e); nNodeType=-1; } + + node.nStart = token.nNext; + node.nNodeFlags = 0; + + int nR = token.nNext; + MCD_PCSZ pDoc = &token.pDoc[nR]; + register unsigned int cD = (unsigned int)*pDoc; + if ( ! cD ) + { + node.nLength = 0; + node.nNodeType = 0; + return -2; // end of document + } + + while ( 1 ) + { + cD = (unsigned int)*pDoc; + if ( ! cD ) + { + nR = (int)(pDoc - token.pDoc) - 1; + if ( nNodeType != MNT_WHITESPACE && nNodeType != MNT_TEXT ) + { + MCD_PCSZ pType = MCD_T("tag"); + if ( (nParseFlags & PD_DOCTYPE) || nNodeType == MNT_DOCUMENT_TYPE ) + pType = MCD_T("Doctype"); + else if ( nNodeType == MNT_ELEMENT ) + pType = MCD_T("Element tag"); + else if ( nNodeType == 0 ) + pType = MCD_T("Element end tag"); + else if ( nNodeType == MNT_CDATA_SECTION ) + pType = MCD_T("CDATA Section"); + else if ( nNodeType == MNT_PROCESSING_INSTRUCTION ) + pType = MCD_T("Processing instruction"); + else if ( nNodeType == MNT_COMMENT ) + pType = MCD_T("Comment"); + nNodeType = -1; + node.strMeta = pType; + node.strMeta += MCD_T(" at offset ") + x_IntToStr(node.nStart); + node.strMeta += MCD_T(" unterminated"); + } + break; + } + + if ( nName ) + { + if ( MCD_PSZCHR(MCD_T(" \t\n\r/>"),(MCD_CHAR)cD) ) + { + int nNameLen = (int)(pDoc - token.pDoc) - nName; + if ( nNodeType == 0 ) + { + token.nL = nName; + token.nR = nName + nNameLen - 1; + } + else + { + MCD_STRASSIGN(node.strMeta,&token.pDoc[nName],nNameLen); + } + nName = 0; + cDminus2 = 0; + cDminus1 = 0; + } + else + { + pDoc += 
MCD_CLEN( pDoc ); + continue; + } + } + + if ( pFindEnd ) + { + if ( cD == '>' && ! (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) ) + { + nR = (int)(pDoc - token.pDoc); + if ( nEndLen == 1 ) + { + pFindEnd = NULL; + if ( nNodeType == MNT_ELEMENT && cDminus1 == '/' ) + { + if ( (! cDminus2) || MCD_PSZCHR(MCD_T(" \t\n\r\'\""),(MCD_CHAR)cDminus2) ) + node.nNodeFlags |= MNF_EMPTY; + } + } + else if ( nR > nEndLen ) + { + // Test for end of PI or comment + MCD_PCSZ pEnd = pDoc - nEndLen + 1; + MCD_PCSZ pInFindEnd = pFindEnd; + int nLen = nEndLen; + while ( --nLen && *pEnd++ == *pInFindEnd++ ); + if ( nLen == 0 ) + pFindEnd = NULL; + } + if ( ! pFindEnd && ! (nParseFlags & PD_DOCTYPE) ) + break; + } + else if ( cD == '<' && (nNodeType == MNT_TEXT || nNodeType == -1) ) + { + nR = (int)(pDoc - token.pDoc) - 1; + break; + } + else if ( nNodeType & MNT_ELEMENT ) + { + if ( (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) ) + { + if ( cD == '\"' && (nParseFlags&PD_INQUOTE_D) ) + nParseFlags ^= PD_INQUOTE_D; // off + else if ( cD == '\'' && (nParseFlags&PD_INQUOTE_S) ) + nParseFlags ^= PD_INQUOTE_S; // off + } + else // not in quotes + { + // Only set INQUOTE status when preceeded by equal sign + if ( cD == '\"' && (nParseFlags&PD_EQUALS) ) + nParseFlags ^= PD_INQUOTE_D|PD_EQUALS; // D on, equals off + else if ( cD == '\'' && (nParseFlags&PD_EQUALS) ) + nParseFlags ^= PD_INQUOTE_S|PD_EQUALS; // S on, equals off + else if ( cD == '=' && cDminus1 != '=' && ! (nParseFlags&PD_EQUALS) ) + nParseFlags ^= PD_EQUALS; // on + else if ( (nParseFlags&PD_EQUALS) && ! MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) + nParseFlags ^= PD_EQUALS; // off + } + cDminus2 = cDminus1; + cDminus1 = cD; + } + else if ( nNodeType & MNT_DOCUMENT_TYPE ) + { + if ( cD == '\"' && ! (nParseFlags&PD_INQUOTE_S) ) + nParseFlags ^= PD_INQUOTE_D; // toggle + else if ( cD == '\'' && ! 
(nParseFlags&PD_INQUOTE_D) ) + nParseFlags ^= PD_INQUOTE_S; // toggle + } + } + else if ( nParseFlags ) + { + if ( nParseFlags & PD_TEXTORWS ) + { + if ( cD == '<' ) + { + nR = (int)(pDoc - token.pDoc) - 1; + nNodeType = MNT_WHITESPACE; + break; + } + else if ( ! MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) + { + nParseFlags ^= PD_TEXTORWS; + FINDNODETYPE( MCD_T("<"), MNT_TEXT ) + } + } + else if ( nParseFlags & PD_OPENTAG ) + { + nParseFlags ^= PD_OPENTAG; + if ( cD > 0x60 || ( cD > 0x40 && cD < 0x5b ) || cD == 0x5f || cD == 0x3a ) + FINDNODETYPENAME( MCD_T(">"), MNT_ELEMENT, 1 ) + else if ( cD == '/' ) + FINDNODETYPENAME( MCD_T(">"), 0, 2 ) + else if ( cD == '!' ) + nParseFlags |= PD_BANG; + else if ( cD == '?' ) + FINDNODETYPENAME( MCD_T("?>"), MNT_PROCESSING_INSTRUCTION, 2 ) + else + FINDNODEBAD( MCD_T("tag name character") ) + } + else if ( nParseFlags & PD_BANG ) + { + nParseFlags ^= PD_BANG; + if ( cD == '-' ) + nParseFlags |= PD_DASH; + else if ( cD == '[' && !(nParseFlags & PD_DOCTYPE) ) + nParseFlags |= PD_BRACKET; + else if ( cD == 'D' && !(nParseFlags & PD_DOCTYPE) ) + nParseFlags |= PD_DOCTYPE; + else if ( MCD_PSZCHR(MCD_T("EAN"),(MCD_CHAR)cD) ) // "), MNT_DOCUMENT_TYPE ) + else + FINDNODEBAD( MCD_T("! 
tag") ) + } + else if ( nParseFlags & PD_DASH ) + { + nParseFlags ^= PD_DASH; + if ( cD == '-' ) + FINDNODETYPE( MCD_T("-->"), MNT_COMMENT ) + else + FINDNODEBAD( MCD_T("comment tag") ) + } + else if ( nParseFlags & PD_BRACKET ) + { + nParseFlags ^= PD_BRACKET; + if ( cD == 'C' ) + FINDNODETYPE( MCD_T("]]>"), MNT_CDATA_SECTION ) + else + FINDNODEBAD( MCD_T("tag") ) + } + else if ( nParseFlags & PD_DOCTYPE ) + { + if ( cD == '<' ) + nParseFlags |= PD_OPENTAG; + else if ( cD == '>' ) + { + nR = (int)(pDoc - token.pDoc); + nNodeType = MNT_DOCUMENT_TYPE; + break; + } + } + } + else if ( cD == '<' ) + { + nParseFlags |= PD_OPENTAG; + } + else + { + nNodeType = MNT_WHITESPACE; + if ( MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) + nParseFlags |= PD_TEXTORWS; + else + FINDNODETYPE( MCD_T("<"), MNT_TEXT ) + } + pDoc += MCD_CLEN( pDoc ); + } + token.nNext = nR + 1; + node.nLength = token.nNext - node.nStart; + node.nNodeType = nNodeType; + return nNodeType; +} + +MCD_STR CMarkup::x_GetPath( int iPos ) const +{ + MCD_STR strPath; + while ( iPos ) + { + MCD_STR strTagName = x_GetTagName( iPos ); + int iPosParent = m_aPos[iPos].iElemParent; + int iPosSib = 0; + int nCount = 0; + while ( iPosSib != iPos ) + { + iPosSib = x_FindElem( iPosParent, iPosSib, MCD_2PCSZ(strTagName) ); + ++nCount; + } + if ( nCount > 1 ) + { + MCD_CHAR szPred[25]; + MCD_SPRINTF( MCD_SSZ(szPred), MCD_T("[%d]"), nCount ); + strPath = MCD_T("/") + strTagName + szPred + strPath; + } + else + strPath = MCD_T("/") + strTagName + strPath; + iPos = iPosParent; + } + return strPath; +} + +MCD_STR CMarkup::x_GetTagName( int iPos ) const +{ + // Return the tag name at specified element + TokenPos token( m_strDoc, m_nDocFlags ); + token.nNext = m_aPos[iPos].nStart + 1; + if ( ! iPos || ! 
x_FindName( token ) ) + return MCD_T(""); + + // Return substring of document + return x_GetToken( token ); +} + +bool CMarkup::x_FindAttrib( CMarkup::TokenPos& token, MCD_PCSZ pAttrib, int n/*=0*/ ) +{ + // Return true if found, otherwise false and token.nNext is new insertion point + // If pAttrib is NULL find attrib n and leave token at attrib name + // If pAttrib is given, find matching attrib and leave token at value + // support non-well-formed attributes e.g. href=/advanced_search?hl=en, nowrap + // token also holds start and length of preceeding whitespace to support remove + // + int nPreSpaceStart; + int nPreSpaceLength; + int nChar; + MCD_CHAR cFirstChar; + MCD_PCSZ pDoc = token.pDoc; + int nAttrib = -1; // starts at tag name + int nFoundAttribNameR = 0; + bool bAfterEqual = false; + while ( 1 ) + { + // Starting at token.nNext, bypass whitespace and find the next token + nChar = token.nNext; + nPreSpaceStart = nChar; + if ( ! x_FindAny(pDoc,nChar) ) + break; + nPreSpaceLength = nChar - nPreSpaceStart; + + // Is it an opening quote? + cFirstChar = pDoc[nChar]; + if ( cFirstChar == '\"' || cFirstChar == '\'' ) + { + token.nTokenFlags |= MNF_QUOTED; + + // Move past opening quote + ++nChar; + token.nL = nChar; + + // Look for closing quote + while ( pDoc[nChar] && pDoc[nChar] != cFirstChar ) + nChar += MCD_CLEN( &pDoc[nChar] ); + + // Set right to before closing quote + token.nR = nChar - 1; + + // Set nChar past closing quote unless at end of document + if ( pDoc[nChar] ) + ++nChar; + } + else + { + token.nTokenFlags &= ~MNF_QUOTED; + + // Go until special char or whitespace + token.nL = nChar; + if ( bAfterEqual ) + { + while ( pDoc[nChar] && ! MCD_PSZCHR(MCD_T(" \t\n\r>"),pDoc[nChar]) ) + nChar += MCD_CLEN( &pDoc[nChar] ); + } + else + { + while ( pDoc[nChar] && ! 
MCD_PSZCHR(MCD_T("= \t\n\r>/?"),pDoc[nChar]) ) + nChar += MCD_CLEN( &pDoc[nChar] ); + } + + // Adjust end position if it is one special char + if ( nChar == token.nL ) + ++nChar; // it is a special char + token.nR = nChar - 1; + } + + // nNext points to one past last char of token + token.nNext = nChar; + + if ( ! bAfterEqual && ! (token.nTokenFlags&MNF_QUOTED) ) + { + // Is it an equal sign? + MCD_CHAR cChar = pDoc[token.nL]; + if ( cChar == '=' ) + { + bAfterEqual = true; + continue; + } + + // Is it the right angle bracket? + if ( cChar == '>' || cChar == '/' || cChar == '?' ) + { + token.nNext = nPreSpaceStart; + break; // attrib not found + } + + if ( nFoundAttribNameR ) + break; + + // Attribute name + if ( nAttrib != -1 ) + { + if ( ! pAttrib ) + { + if ( nAttrib == n ) + return true; // found by number + } + else if ( token.Match(pAttrib) ) + { + // Matched attrib name, go forward to value + nFoundAttribNameR = token.nR; + token.nPreSpaceStart = nPreSpaceStart; + token.nPreSpaceLength = nPreSpaceLength; + } + } + ++nAttrib; + } + else if ( nFoundAttribNameR ) + break; + bAfterEqual = false; + } + + if ( nFoundAttribNameR ) + { + if ( ! 
bAfterEqual ) + { + // when attribute has no value the value is the attribute name + token.nL = token.nPreSpaceStart + token.nPreSpaceLength; + token.nR = nFoundAttribNameR; + token.nNext = nFoundAttribNameR + 1; + } + return true; // found by name + } + return false; // not found +} + +MCD_STR CMarkup::x_GetAttrib( int iPos, MCD_PCSZ pAttrib ) const +{ + // Return the value of the attrib + TokenPos token( m_strDoc, m_nDocFlags ); + if ( iPos && m_nNodeType == MNT_ELEMENT ) + token.nNext = m_aPos[iPos].nStart + 1; + else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION ) + token.nNext = m_nNodeOffset + 2; + else + return MCD_T(""); + + if ( pAttrib && x_FindAttrib( token, pAttrib ) ) + return UnescapeText( &token.pDoc[token.nL], token.Length() ); + return MCD_T(""); +} + +bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, int nValue, int nFlags /*=0*/ ) +{ + // Convert integer to string + MCD_CHAR szVal[25]; + MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue ); + return x_SetAttrib( iPos, pAttrib, szVal, nFlags ); +} + +bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, MCD_PCSZ pValue, int nFlags /*=0*/ ) +{ + // Set attribute in iPos element + TokenPos token( m_strDoc, m_nDocFlags ); + if ( iPos && m_nNodeType == MNT_ELEMENT ) + token.nNext = m_aPos[iPos].nStart + 1; + else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION ) + token.nNext = m_nNodeOffset + 2; + else + return false; + + // Create insertion text depending on whether attribute already exists + // Decision: for empty value leaving attrib="" instead of removing attrib + int nReplace = 0; + int nInsertAt; + MCD_STR strInsert; + strInsert += x_ATTRIBQUOTE; + strInsert += EscapeText( pValue, MNF_ESCAPEQUOTES|nFlags ); + strInsert += x_ATTRIBQUOTE; + if ( x_FindAttrib( token, pAttrib ) ) + { + // Replace value + nInsertAt = token.nL - ((token.nTokenFlags&MNF_QUOTED)?1:0); + nReplace = token.Length() + 
((token.nTokenFlags&MNF_QUOTED)?2:0); + } + else + { + // Insert string name value pair + MCD_STR strFormat; + strFormat = MCD_T(" "); + strFormat += pAttrib; + strFormat += MCD_T("="); + strFormat += strInsert; + strInsert = strFormat; + nInsertAt = token.nNext; + } + + x_DocChange( nInsertAt, nReplace, strInsert ); + int nAdjust = MCD_STRLENGTH(strInsert) - nReplace; + if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION ) + { + x_AdjustForNode( m_iPosParent, m_iPos, nAdjust ); + m_nNodeLength += nAdjust; + MARKUP_SETDEBUGSTATE; + return true; + } + m_aPos[iPos].AdjustStartTagLen( nAdjust ); + m_aPos[iPos].nLength += nAdjust; + x_Adjust( iPos, nAdjust ); + MARKUP_SETDEBUGSTATE; + return true; +} + + +bool CMarkup::x_CreateNode( MCD_STR& strNode, int nNodeType, MCD_PCSZ pText ) +{ + // Set strNode based on nNodeType and szData + // Return false if szData would jeopardize well-formed document + // + switch ( nNodeType ) + { + case MNT_PROCESSING_INSTRUCTION: + strNode = MCD_T(""); + break; + case MNT_COMMENT: + strNode = MCD_T(""); + break; + case MNT_ELEMENT: + strNode = MCD_T("<"); + strNode += pText; + strNode += MCD_T("/>"); + break; + case MNT_TEXT: + case MNT_WHITESPACE: + strNode = EscapeText( pText ); + break; + case MNT_DOCUMENT_TYPE: + strNode = pText; + break; + case MNT_LONE_END_TAG: + return false; + case MNT_CDATA_SECTION: + if ( MCD_PSZSTR(pText,MCD_T("]]>")) != NULL ) + return false; + strNode = MCD_T(""); + break; + } + return true; +} + +MCD_STR CMarkup::x_EncodeCDATASection( MCD_PCSZ szData ) +{ + // Split CDATA Sections if there are any end delimiters + MCD_STR strData = MCD_T("") ); + while ( pszEnd ) + { + strData += MCD_STR( pszNextStart, (int)(pszEnd - pszNextStart) ); + strData += MCD_T("]]]]>"); + pszNextStart = pszEnd + 3; + pszEnd = MCD_PSZSTR( pszNextStart, MCD_T("]]>") ); + } + strData += pszNextStart; + strData += MCD_T("]]>"); + return strData; +} + +bool CMarkup::x_SetData( int iPos, int nValue ) +{ + // Convert integer to string + 
MCD_CHAR szVal[25]; + MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue ); + return x_SetData( iPos, szVal, 0 ); +} + +bool CMarkup::x_SetData( int iPos, MCD_PCSZ szData, int nFlags ) +{ + // Set data at specified position + // if nFlags==1, set content of element to a CDATA Section + MCD_STR strInsert; + + if ( iPos == m_iPos && m_nNodeLength ) + { + // Not an element + if ( ! x_CreateNode(strInsert, m_nNodeType, szData) ) + return false; + x_DocChange( m_nNodeOffset, m_nNodeLength, strInsert ); + x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(strInsert) - m_nNodeLength ); + m_nNodeLength = MCD_STRLENGTH(strInsert); + MARKUP_SETDEBUGSTATE; + return true; + } + + // Set data in iPos element + if ( ! iPos || m_aPos[iPos].iElemChild ) + return false; + + // Build strInsert from szData based on nFlags + if ( nFlags & MNF_WITHCDATA ) + strInsert = x_EncodeCDATASection( szData ); + else + strInsert = EscapeText( szData, nFlags ); + + // Insert + NodePos node( MNF_WITHNOLINES|MNF_REPLACE ); + node.strMeta = strInsert; + int iPosBefore = 0; + int nReplace = x_InsertNew( iPos, iPosBefore, node ); + int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace; + x_Adjust( iPos, nAdjust ); + m_aPos[iPos].nLength += nAdjust; + if ( m_aPos[iPos].nFlags & MNF_ILLDATA ) + m_aPos[iPos].nFlags &= ~MNF_ILLDATA; + MARKUP_SETDEBUGSTATE; + return true; +} + +MCD_STR CMarkup::x_GetData( int iPos ) const +{ + if ( iPos == m_iPos && m_nNodeLength ) + { + if ( m_nNodeType == MNT_COMMENT ) + return MCD_STRMID( m_strDoc, m_nNodeOffset+4, m_nNodeLength-7 ); + else if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION ) + return MCD_STRMID( m_strDoc, m_nNodeOffset+2, m_nNodeLength-4 ); + else if ( m_nNodeType == MNT_CDATA_SECTION ) + return MCD_STRMID( m_strDoc, m_nNodeOffset+9, m_nNodeLength-12 ); + else if ( m_nNodeType == MNT_TEXT ) + return UnescapeText( &(MCD_2PCSZ(m_strDoc))[m_nNodeOffset], m_nNodeLength ); + else if ( m_nNodeType == MNT_LONE_END_TAG ) + return MCD_STRMID( m_strDoc, 
m_nNodeOffset+2, m_nNodeLength-3 ); + else + return MCD_STRMID( m_strDoc, m_nNodeOffset, m_nNodeLength ); + } + + // Return a string representing data between start and end tag + // Return empty string if there are any children elements + MCD_STR strData; + if ( ! m_aPos[iPos].iElemChild && ! m_aPos[iPos].IsEmptyElement() ) + { + // Quick scan for any tags inside content + int nContentLen = m_aPos[iPos].ContentLen(); + int nStartContent = m_aPos[iPos].StartContent(); + MCD_PCSZ pszContent = &(MCD_2PCSZ(m_strDoc))[nStartContent]; + MCD_PCSZ pszTag = MCD_PSZCHR( pszContent, '<' ); + if ( pszTag && ((int)(pszTag-pszContent) < nContentLen) ) + { + // Concatenate all CDATA Sections and text nodes, ignore other nodes + TokenPos token( m_strDoc, m_nDocFlags ); + token.nNext = nStartContent; + NodePos node; + while ( token.nNext < nStartContent + nContentLen ) + { + x_ParseNode( token, node ); + if ( node.nNodeType == MNT_TEXT ) + strData += UnescapeText( &token.pDoc[node.nStart], node.nLength ); + else if ( node.nNodeType == MNT_CDATA_SECTION ) + strData += MCD_STRMID( m_strDoc, node.nStart+9, node.nLength-12 ); + } + } + else // no tags + strData = UnescapeText( &(MCD_2PCSZ(m_strDoc))[nStartContent], nContentLen ); + } + return strData; +} + +MCD_STR CMarkup::x_GetElemContent( int iPos ) const +{ + if ( iPos && m_aPos[iPos].ContentLen() ) + return MCD_STRMID( m_strDoc, m_aPos[iPos].StartContent(), m_aPos[iPos].ContentLen() ); + return MCD_T(""); +} + +bool CMarkup::x_SetElemContent( MCD_PCSZ szContent ) +{ + // Set data in iPos element only + if ( ! 
m_iPos ) + return false; + + if ( m_nNodeLength ) + return false; // not an element + + // Unlink all children + int iPos = m_iPos; + int iPosChild = m_aPos[iPos].iElemChild; + bool bHadChild = (iPosChild != 0); + while ( iPosChild ) + iPosChild = x_ReleaseSubDoc( iPosChild ); + if ( bHadChild ) + x_CheckSavedPos(); + + // Parse content + bool bWellFormed = true; + TokenPos token( szContent, m_nDocFlags ); + int iPosVirtual = x_GetFreePos(); + m_aPos[iPosVirtual].ClearVirtualParent(); + m_aPos[iPosVirtual].SetLevel( m_aPos[iPos].Level() + 1 ); + iPosChild = x_ParseElem( iPosVirtual, token ); + if ( m_aPos[iPosVirtual].nFlags & MNF_ILLFORMED ) + bWellFormed = false; + m_aPos[iPos].nFlags = (m_aPos[iPos].nFlags & ~MNF_ILLDATA) | (m_aPos[iPosVirtual].nFlags & MNF_ILLDATA); + + // Prepare insert and adjust offsets + NodePos node( MNF_WITHNOLINES|MNF_REPLACE ); + node.strMeta = szContent; + int iPosBefore = 0; + int nReplace = x_InsertNew( iPos, iPosBefore, node ); + + // Adjust and link in the inserted elements + x_Adjust( iPosChild, node.nStart ); + m_aPos[iPosChild].nStart += node.nStart; + m_aPos[iPos].iElemChild = iPosChild; + while ( iPosChild ) + { + m_aPos[iPosChild].iElemParent = iPos; + iPosChild = m_aPos[iPosChild].iElemNext; + } + x_ReleasePos( iPosVirtual ); + + int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace; + x_Adjust( iPos, nAdjust, true ); + m_aPos[iPos].nLength += nAdjust; + + x_SetPos( m_iPosParent, m_iPos, 0 ); + return bWellFormed; +} + +void CMarkup::x_DocChange( int nLeft, int nReplace, const MCD_STR& strInsert ) +{ + // Insert strInsert int m_strDoc at nLeft replacing nReplace chars + // When creating a document, reduce reallocs by reserving string space + // If realloc needed, allow for 1.5 times the new length + // + int nDocLength = MCD_STRLENGTH(m_strDoc); + int nInsLength = MCD_STRLENGTH(strInsert); + int nNewLength = nInsLength + nDocLength - nReplace; + int nAllocLen = MCD_STRCAPACITY(m_strDoc); +#if defined(MCD_STRINSERTREPLACE) // 
STL, replace method + if ( nNewLength > nAllocLen ) + MCD_BLDRESERVE( m_strDoc, (nNewLength + nNewLength/2 + 128) ); + MCD_STRINSERTREPLACE( m_strDoc, nLeft, nReplace, strInsert ); +#else // MFC, no replace method + int nBufferLen = nNewLength; + if ( nNewLength > nAllocLen ) + nBufferLen += nBufferLen/2 + 128; + MCD_CHAR* pDoc = MCD_GETBUFFER( m_strDoc, nBufferLen ); + if ( nInsLength != nReplace && nLeft+nReplace < nDocLength ) + memmove( &pDoc[nLeft+nInsLength], &pDoc[nLeft+nReplace], (nDocLength-nLeft-nReplace)*sizeof(MCD_CHAR) ); + memcpy( &pDoc[nLeft], strInsert, nInsLength*sizeof(MCD_CHAR) ); + MCD_RELEASEBUFFER( m_strDoc, pDoc, nNewLength ); +#endif // MFC, no replace method + +} + +void CMarkup::x_Adjust( int iPos, int nShift, bool bAfterPos /*=false*/ ) +{ + // Loop through affected elements and adjust indexes + // Algorithm: + // 1. update children unless bAfterPos + // (if no children or bAfterPos is true, length of iPos not affected) + // 2. update starts of next siblings and their children + // 3. go up until there is a next sibling of a parent and update starts + // 4. step 2 + int iPosTop = m_aPos[iPos].iElemParent; + bool bPosFirst = bAfterPos; // mark as first to skip its children + + // Stop when we've reached the virtual parent (which has no tags) + while ( m_aPos[iPos].StartTagLen() ) + { + // Were we at containing parent of affected position? + bool bPosTop = false; + if ( iPos == iPosTop ) + { + // Move iPosTop up one towards root + iPosTop = m_aPos[iPos].iElemParent; + bPosTop = true; + } + + // Traverse to the next update position + if ( ! bPosTop && ! 
bPosFirst && m_aPos[iPos].iElemChild ) + { + // Depth first + iPos = m_aPos[iPos].iElemChild; + } + else if ( m_aPos[iPos].iElemNext ) + { + iPos = m_aPos[iPos].iElemNext; + } + else + { + // Look for next sibling of a parent of iPos + // When going back up, parents have already been done except iPosTop + while ( 1 ) + { + iPos = m_aPos[iPos].iElemParent; + if ( iPos == iPosTop ) + break; + if ( m_aPos[iPos].iElemNext ) + { + iPos = m_aPos[iPos].iElemNext; + break; + } + } + } + bPosFirst = false; + + // Shift indexes at iPos + if ( iPos != iPosTop ) + m_aPos[iPos].nStart += nShift; + else + m_aPos[iPos].nLength += nShift; + } +} + +int CMarkup::x_InsertNew( int iPosParent, int& iPosRel, CMarkup::NodePos& node ) +{ + // Parent empty tag or tags with no content? + bool bEmptyParentTag = iPosParent && m_aPos[iPosParent].IsEmptyElement(); + bool bNoContentParentTags = iPosParent && ! m_aPos[iPosParent].ContentLen(); + if ( node.nLength ) + { + // Located at a non-element node + if ( ! (node.nNodeFlags & MNF_INSERT) ) + node.nStart += node.nLength; + } + else if ( iPosRel ) + { + // Located at an element + node.nStart = m_aPos[iPosRel].nStart; + if ( ! (node.nNodeFlags & MNF_INSERT) ) // follow iPosRel + node.nStart += m_aPos[iPosRel].nLength; + } + else if ( bEmptyParentTag ) + { + // Parent has no separate end tag, so split empty element + if ( m_aPos[iPosParent].nFlags & MNF_NONENDED ) + node.nStart = m_aPos[iPosParent].StartContent(); + else + node.nStart = m_aPos[iPosParent].StartContent() - 1; + } + else + { + if ( node.nNodeFlags & (MNF_INSERT|MNF_REPLACE) ) + node.nStart = m_aPos[iPosParent].StartContent(); + else // before end tag + node.nStart = m_aPos[iPosParent].StartAfter() - m_aPos[iPosParent].EndTagLen(); + } + + // Go up to start of next node, unless its splitting an empty element + if ( ! (node.nNodeFlags&(MNF_WITHNOLINES|MNF_REPLACE)) && ! bEmptyParentTag ) + { + MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc); + int nChar = node.nStart; + if ( ! 
x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' ) + node.nStart = nChar; + } + + // Is insert relative to element position? (i.e. not other kind of node) + if ( ! node.nLength ) + { + // Modify iPosRel to reflect position before + if ( iPosRel ) + { + if ( node.nNodeFlags & MNF_INSERT ) + { + if ( ! (m_aPos[iPosRel].nFlags & MNF_FIRST) ) + iPosRel = m_aPos[iPosRel].iElemPrev; + else + iPosRel = 0; + } + } + else if ( ! (node.nNodeFlags & MNF_INSERT) ) + { + // If parent has a child, add after last child + if ( m_aPos[iPosParent].iElemChild ) + iPosRel = m_aPos[m_aPos[iPosParent].iElemChild].iElemPrev; + } + } + + // Get node length (used only by x_AddNode) + node.nLength = MCD_STRLENGTH(node.strMeta); + + // Prepare end of lines + if ( (! (node.nNodeFlags & MNF_WITHNOLINES)) && (bEmptyParentTag || bNoContentParentTags) ) + node.nStart += x_EOLLEN; + if ( ! (node.nNodeFlags & MNF_WITHNOLINES) ) + node.strMeta += x_EOL; + + // Calculate insert offset and replace length + int nReplace = 0; + int nInsertAt = node.nStart; + if ( bEmptyParentTag ) + { + MCD_STR strTagName = x_GetTagName( iPosParent ); + MCD_STR strFormat; + if ( node.nNodeFlags & MNF_WITHNOLINES ) + strFormat = MCD_T(">"); + else + strFormat = MCD_T(">") x_EOL; + strFormat += node.strMeta; + strFormat += MCD_T("value or + // + ElemPos* pElem = &m_aPos[iPos]; + int nLenName = MCD_PSZLEN(pName); + if ( ! pValue || ! 
pValue[0] ) + { + // empty element + node.strMeta = MCD_T("<"); + node.strMeta += pName; + if ( nFlags & MNF_WITHNOEND ) + { + node.strMeta += MCD_T(">"); + pElem->SetStartTagLen( nLenName + 2 ); + pElem->nLength = nLenName + 2; + } + else + { + if ( nFlags & MNF_WITHXHTMLSPACE ) + { + node.strMeta += MCD_T(" />"); + pElem->SetStartTagLen( nLenName + 4 ); + pElem->nLength = nLenName + 4; + } + else + { + node.strMeta += MCD_T("/>"); + pElem->SetStartTagLen( nLenName + 3 ); + pElem->nLength = nLenName + 3; + } + } + pElem->SetEndTagLen( 0 ); + } + else + { + // value + MCD_STR strValue; + if ( nFlags & MNF_WITHCDATA ) + strValue = x_EncodeCDATASection( pValue ); + else + strValue = EscapeText( pValue, nFlags ); + int nLenValue = MCD_STRLENGTH(strValue); + node.strMeta = MCD_T("<"); + node.strMeta += pName; + node.strMeta += MCD_T(">"); + node.strMeta += strValue; + node.strMeta += MCD_T(""); + pElem->SetEndTagLen( nLenName + 3 ); + pElem->nLength = nLenName * 2 + nLenValue + 5; + pElem->SetStartTagLen( nLenName + 2 ); + } + + // Insert + int nReplace = x_InsertNew( iPosParent, iPosBefore, node ); + + pElem->nStart = node.nStart; + pElem->iElemChild = 0; + if ( nFlags & MNF_WITHNOEND ) + pElem->nFlags = MNF_NONENDED; + else + pElem->nFlags = 0; + x_LinkElem( iPosParent, iPosBefore, iPos ); + + x_Adjust( iPos, MCD_STRLENGTH(node.strMeta) - nReplace ); + + if ( nFlags & MNF_CHILD ) + x_SetPos( m_iPosParent, iPosParent, iPos ); + else + x_SetPos( iPosParent, iPos, 0 ); + return true; +} + +MCD_STR CMarkup::x_GetSubDoc( int iPos ) const +{ + if ( iPos ) + { + int nStart = m_aPos[iPos].nStart; + int nNext = nStart + m_aPos[iPos].nLength; + MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc); + int nChar = nNext; + if ( ! 
x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' ) + nNext = nChar; + return MCD_STRMID( m_strDoc, nStart, nNext - nStart ); + } + return MCD_T(""); +} + +bool CMarkup::x_AddSubDoc( MCD_PCSZ pSubDoc, int nFlags ) +{ + // Add subdocument, parse, and modify positions of affected elements + // + NodePos node( nFlags ); + int iPosParent, iPosBefore; + if ( nFlags & MNF_CHILD ) + { + // Add a subdocument under main position, before or after child + if ( ! m_iPos ) + return false; + iPosParent = m_iPos; + iPosBefore = m_iPosChild; + } + else + { + // Add a subdocument under parent position, before or after main + iPosParent = m_iPosParent; + iPosBefore = m_iPos; + node.nStart = m_nNodeOffset; + node.nLength = m_nNodeLength; + } + + // Parse subdocument + bool bWellFormed = true; + TokenPos token( pSubDoc, m_nDocFlags ); + int iPosVirtual = x_GetFreePos(); + m_aPos[iPosVirtual].ClearVirtualParent(); + m_aPos[iPosVirtual].SetLevel( m_aPos[iPosParent].Level() + 1 ); + int iPos = x_ParseElem( iPosVirtual, token ); + if ( (!iPos) || m_aPos[iPosVirtual].nFlags & MNF_ILLFORMED ) + bWellFormed = false; + if ( m_aPos[iPosVirtual].nFlags & MNF_ILLDATA ) + m_aPos[iPosParent].nFlags |= MNF_ILLDATA; + + // Extract subdocument without leading/trailing nodes + int nExtractStart = 0; + int iPosLast = m_aPos[iPos].iElemPrev; + if ( bWellFormed ) + { + nExtractStart = m_aPos[iPos].nStart; + int nExtractLength = m_aPos[iPos].nLength; + if ( iPos != iPosLast ) + { + nExtractLength = m_aPos[iPosLast].nStart - nExtractStart + m_aPos[iPosLast].nLength; + bWellFormed = false; // treat as subdoc here, but return not well-formed + } + MCD_STRASSIGN(node.strMeta,&pSubDoc[nExtractStart],nExtractLength); + } + else + { + node.strMeta = pSubDoc; + node.nNodeFlags |= MNF_WITHNOLINES; + } + + // Insert + int nReplace = x_InsertNew( iPosParent, iPosBefore, node ); + + // Adjust and link in the inserted elements + // iPosVirtual will stop it from affecting rest of document + int nAdjust = node.nStart - 
nExtractStart; + if ( iPos && nAdjust ) + { + x_Adjust( iPos, nAdjust ); + m_aPos[iPos].nStart += nAdjust; + } + int iPosChild = iPos; + while ( iPosChild ) + { + int iPosNext = m_aPos[iPosChild].iElemNext; + x_LinkElem( iPosParent, iPosBefore, iPosChild ); + iPosBefore = iPosChild; + iPosChild = iPosNext; + } + x_ReleasePos( iPosVirtual ); + + // Now adjust remainder of document + x_Adjust( iPosLast, MCD_STRLENGTH(node.strMeta) - nReplace, true ); + + // Set position to top element of subdocument + if ( nFlags & MNF_CHILD ) + x_SetPos( m_iPosParent, iPosParent, iPos ); + else // Main + x_SetPos( m_iPosParent, iPos, 0 ); + return bWellFormed; +} + +int CMarkup::x_RemoveElem( int iPos ) +{ + // Remove element and all contained elements + // Return new position + // + if ( ! iPos ) + return 0; + + // Determine whether any whitespace up to next tag + int nAfterEnd = m_aPos[iPos].StartAfter(); + MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc); + int nChar = nAfterEnd; + if ( ! x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' ) + nAfterEnd = nChar; + + // Remove from document, adjust affected indexes, and unlink + int nLen = nAfterEnd - m_aPos[iPos].nStart; + x_DocChange( m_aPos[iPos].nStart, nLen, MCD_STR() ); + x_Adjust( iPos, - nLen, true ); + int iPosPrev = x_UnlinkElem( iPos ); + x_CheckSavedPos(); + return iPosPrev; +} + +void CMarkup::x_LinkElem( int iPosParent, int iPosBefore, int iPos ) +{ + // Link in element, and initialize nFlags, and iElem indexes + ElemPos* pElem = &m_aPos[iPos]; + pElem->iElemParent = iPosParent; + if ( iPosBefore ) + { + // Link in after iPosBefore + pElem->nFlags &= ~MNF_FIRST; + pElem->iElemNext = m_aPos[iPosBefore].iElemNext; + if ( pElem->iElemNext ) + m_aPos[pElem->iElemNext].iElemPrev = iPos; + else + m_aPos[m_aPos[iPosParent].iElemChild].iElemPrev = iPos; + m_aPos[iPosBefore].iElemNext = iPos; + pElem->iElemPrev = iPosBefore; + } + else + { + // Link in as first child + pElem->nFlags |= MNF_FIRST; + if ( m_aPos[iPosParent].iElemChild ) + { + 
pElem->iElemNext = m_aPos[iPosParent].iElemChild; + pElem->iElemPrev = m_aPos[pElem->iElemNext].iElemPrev; + m_aPos[pElem->iElemNext].iElemPrev = iPos; + m_aPos[pElem->iElemNext].nFlags ^= MNF_FIRST; + } + else + { + pElem->iElemNext = 0; + pElem->iElemPrev = iPos; + } + m_aPos[iPosParent].iElemChild = iPos; + } + if ( iPosParent ) + pElem->SetLevel( m_aPos[iPosParent].Level() + 1 ); +} + +int CMarkup::x_UnlinkElem( int iPos ) +{ + // Fix links to remove element and mark as deleted + // return previous position or zero if none + ElemPos* pElem = &m_aPos[iPos]; + + // Find previous sibling and bypass removed element + int iPosPrev = 0; + if ( pElem->nFlags & MNF_FIRST ) + { + if ( pElem->iElemNext ) // set next as first child + { + m_aPos[pElem->iElemParent].iElemChild = pElem->iElemNext; + m_aPos[pElem->iElemNext].iElemPrev = pElem->iElemPrev; + m_aPos[pElem->iElemNext].nFlags |= MNF_FIRST; + } + else // no children remaining + m_aPos[pElem->iElemParent].iElemChild = 0; + } + else + { + iPosPrev = pElem->iElemPrev; + m_aPos[iPosPrev].iElemNext = pElem->iElemNext; + if ( pElem->iElemNext ) + m_aPos[pElem->iElemNext].iElemPrev = iPosPrev; + else + m_aPos[m_aPos[pElem->iElemParent].iElemChild].iElemPrev = iPosPrev; + } + x_ReleaseSubDoc( iPos ); + return iPosPrev; +} + +int CMarkup::x_ReleasePos( int iPos ) +{ + int iPosNext = m_aPos[iPos].iElemNext; + m_aPos[iPos].iElemNext = m_iPosDeleted; + m_aPos[iPos].nFlags = MNF_DELETED; + m_iPosDeleted = iPos; + return iPosNext; +} + +int CMarkup::x_ReleaseSubDoc( int iPos ) +{ + // Mark position structures as deleted by depth first traversal + // Tricky because iElemNext used in traversal is overwritten for linked list of deleted + // Return value is what iElemNext was before being overwritten + // + int iPosNext = 0, iPosTop = iPos; + while ( 1 ) + { + if ( m_aPos[iPos].iElemChild ) + iPos = m_aPos[iPos].iElemChild; + else + { + while ( 1 ) + { + iPosNext = x_ReleasePos( iPos ); + if ( iPosNext || iPos == iPosTop ) + break; 
+ iPos = m_aPos[iPos].iElemParent; + } + if ( iPos == iPosTop ) + break; + iPos = iPosNext; + } + } + return iPosNext; +} + +bool CMarkup::x_GetMap( SavedPosMap*& pMap, int nMap, int nMapSize /*=7*/ ) +{ + // Find or create map, returns true if map(s) created + SavedPosMap** ppMaps = m_SavedPosMapArray.pMaps; + int nMapIndex = 0; + if ( ppMaps ) + { + // Length of array is unknown, so loop through maps + while ( nMapIndex <= nMap ) + { + pMap = ppMaps[nMapIndex]; + if ( ! pMap ) + break; + if ( nMapIndex == nMap ) + return false; // not created + ++nMapIndex; + } + nMapIndex = 0; + } + + // Create map(s) + // If you access map 1 before map 0 created, then 2 maps will be created + m_SavedPosMapArray.pMaps = new SavedPosMap*[nMap+2]; + if ( ppMaps ) + { + while ( ppMaps[nMapIndex] ) + { + m_SavedPosMapArray.pMaps[nMapIndex] = ppMaps[nMapIndex]; + ++nMapIndex; + } + delete[] ppMaps; + } + ppMaps = m_SavedPosMapArray.pMaps; + while ( nMapIndex <= nMap ) + { + ppMaps[nMapIndex] = new SavedPosMap( nMapSize ); + ++nMapIndex; + } + ppMaps[nMapIndex] = NULL; + pMap = ppMaps[nMap]; + return true; // map(s) created +} + +void CMarkup::x_CheckSavedPos() +{ + // Remove any saved positions now pointing to deleted elements + // Must be done as part of element removal before position reassigned + if ( m_SavedPosMapArray.pMaps ) + { + int nMap = 0; + while ( m_SavedPosMapArray.pMaps[nMap] ) + { + SavedPosMap* pMap = m_SavedPosMapArray.pMaps[nMap]; + for ( int nSlot = 0; nSlot < pMap->nMapSize; ++nSlot ) + { + SavedPos* pSavedPos = pMap->pTable[nSlot]; + if ( pSavedPos ) + { + int nOffset = 0; + int nSavedPosCount = 0; + while ( 1 ) + { + if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED ) + { + int iPos = pSavedPos[nOffset].iPos; + if ( ! 
(m_aPos[iPos].nFlags & MNF_DELETED) ) + { + if ( nSavedPosCount < nOffset ) + { + pSavedPos[nSavedPosCount] = pSavedPos[nOffset]; + pSavedPos[nSavedPosCount].nSavedPosFlags &= ~SavedPos::SPM_LAST; + } + ++nSavedPosCount; + } + } + if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) + { + while ( nSavedPosCount <= nOffset ) + pSavedPos[nSavedPosCount++].nSavedPosFlags &= ~SavedPos::SPM_USED; + break; + } + ++nOffset; + } + } + } + ++nMap; + } + } +} + +void CMarkup::x_AdjustForNode( int iPosParent, int iPos, int nShift ) +{ + // Adjust affected indexes + bool bAfterPos = true; + if ( ! iPos ) + { + // Change happened before or at first element under iPosParent + // If there are any children of iPosParent, adjust from there + // otherwise start at parent and adjust from there + iPos = m_aPos[iPosParent].iElemChild; + if ( iPos ) + { + m_aPos[iPos].nStart += nShift; + bAfterPos = false; + } + else + { + iPos = iPosParent; + m_aPos[iPos].nLength += nShift; + } + } + x_Adjust( iPos, nShift, bAfterPos ); +} + +bool CMarkup::x_AddNode( int nNodeType, MCD_PCSZ pText, int nNodeFlags ) +{ + // Only comments, DTDs, and processing instructions are followed by CRLF + // Other nodes are usually concerned with mixed content, so no CRLF + if ( ! (nNodeType & (MNT_PROCESSING_INSTRUCTION|MNT_COMMENT|MNT_DOCUMENT_TYPE)) ) + nNodeFlags |= MNF_WITHNOLINES; + + // Add node of nNodeType after current node position + NodePos node( nNodeFlags ); + if ( ! 
x_CreateNode(node.strMeta, nNodeType, pText) ) + return false; + + // Locate where to add node relative to current node + int iPosBefore = m_iPos; + int iPosParent = m_iPosParent; + node.nStart = m_nNodeOffset; + node.nLength = m_nNodeLength; + node.nNodeType = nNodeType; + + int nReplace = x_InsertNew( iPosParent, iPosBefore, node ); + + // If its a new element, create an ElemPos + int iPos = iPosBefore; + if ( nNodeType == MNT_ELEMENT ) + { + // Set indexes + iPos = x_GetFreePos(); + ElemPos* pElem = &m_aPos[iPos]; + pElem->nStart = node.nStart; + pElem->SetStartTagLen( node.nLength ); + pElem->SetEndTagLen( 0 ); + pElem->nLength = node.nLength; + node.nStart = 0; + node.nLength = 0; + pElem->iElemChild = 0; + pElem->nFlags = 0; + x_LinkElem( iPosParent, iPosBefore, iPos ); + } + + // Need to adjust element positions after iPos + x_AdjustForNode( iPosParent, iPos, MCD_STRLENGTH(node.strMeta) - nReplace ); + + // Set current position + m_iPos = iPos; + m_iPosChild = 0; + m_nNodeOffset = node.nStart; + m_nNodeLength = node.nLength; + m_nNodeType = nNodeType; + MARKUP_SETDEBUGSTATE; + return true; +} + +void CMarkup::x_RemoveNode( int iPosParent, int& iPos, int& nNodeType, int& nNodeOffset, int& nNodeLength ) +{ + // Remove node and return new position + // + int iPosPrev = iPos; + + // Removing an element? + if ( nNodeType == MNT_ELEMENT ) + { + nNodeOffset = m_aPos[iPos].nStart; + nNodeLength = m_aPos[iPos].nLength; + iPosPrev = x_UnlinkElem( iPos ); + x_CheckSavedPos(); + } + + // Find previous node type, offset and length + int nPrevOffset = 0; + if ( iPosPrev ) + nPrevOffset = m_aPos[iPosPrev].StartAfter(); + else if ( iPosParent ) + nPrevOffset = m_aPos[iPosParent].StartContent(); + TokenPos token( m_strDoc, m_nDocFlags ); + NodePos node; + token.nNext = nPrevOffset; + int nPrevType = 0; + while ( token.nNext < nNodeOffset ) + { + nPrevOffset = token.nNext; + nPrevType = x_ParseNode( token, node ); + } + int nPrevLength = nNodeOffset - nPrevOffset; + if ( ! 
nPrevLength ) + { + // Previous node is iPosPrev element + nPrevOffset = 0; + if ( iPosPrev ) + nPrevType = MNT_ELEMENT; + } + + // Remove node from document + x_DocChange( nNodeOffset, nNodeLength, MCD_STR() ); + x_AdjustForNode( iPosParent, iPosPrev, - nNodeLength ); + + // Was removed node a lone end tag? + if ( nNodeType == MNT_LONE_END_TAG ) + { + // See if we can unset parent MNF_ILLDATA flag + token.nNext = m_aPos[iPosParent].StartContent(); + int nEndOfContent = token.nNext + m_aPos[iPosParent].ContentLen(); + int iPosChild = m_aPos[iPosParent].iElemChild; + while ( token.nNext < nEndOfContent ) + { + if ( x_ParseNode(token,node) <= 0 ) + break; + if ( node.nNodeType == MNT_ELEMENT ) + { + token.nNext = m_aPos[iPosChild].StartAfter(); + iPosChild = m_aPos[iPosChild].iElemNext; + } + } + if ( token.nNext == nEndOfContent ) + m_aPos[iPosParent].nFlags &= ~MNF_ILLDATA; + } + + nNodeType = nPrevType; + nNodeOffset = nPrevOffset; + nNodeLength = nPrevLength; + iPos = iPosPrev; +} + +// Encoding names +// This is a precompiled ASCII hash table for speed and minimum memory requirement +// Each entry consists of a 2 digit name length, 5 digit code page, and the encoding name +// Each table slot can have multiple entries, table size 150 was chosen for even distribution +// +MCD_PCSZ EncodingNameTable[150] = +{ + MCD_T("2701148x-ebcdic-international-euro1028599iso_8859-9"), + MCD_T("0828596ecma-1141420284x-ebcdic-spain"), + MCD_T("0751949cseuckr1420127ansi_x3.4-19681420127ansi_x3.4-1986"),MCD_T(""), + MCD_T("1410001x-mac-japanese"),MCD_T("0500936cn-gb0828597ecma-118"), + MCD_T("0600932x-sjis"),MCD_T("0228591l1"), + MCD_T("0228592l20300936gbk0300437437"),MCD_T("1420280x-ebcdic-italy0228593l3"), + MCD_T("0628596arabic0228594l42701142x-ebcdic-denmarknorway-euro"), + MCD_T("0228595l5"),MCD_T(""),MCD_T("0700950cn-big50900932shift-jis"),MCD_T(""), + MCD_T("0800936csgb23120228605l9"),MCD_T(""),MCD_T(""),MCD_T("0920127iso646-us"), + 
MCD_T("0600850ibm850"),MCD_T("0800949ksc_56010600437ibm437"), + MCD_T("0600852ibm8520600861ibm861"),MCD_T("0620127ibm367"), + MCD_T("1420002x-chinese-eten0320866koi0600737ibm737"), + MCD_T("1410007x-mac-cyrillic1601146x-ebcdic-uk-euro1320107x-ia5-swedish0628591ibm819"), + MCD_T("0600775ibm775"), + MCD_T("0600866ibm8662750937x-ebcdic-traditionalchinese0600857ibm8571020127iso-ir-6us"), + MCD_T("1710002x-mac-chinesetrad"),MCD_T(""),MCD_T("0600869ibm869"),MCD_T(""), + MCD_T("0500437cspc8"),MCD_T("0551932x-euc1250221_iso-2022-jp1000932csshiftjis"), + MCD_T("2000875x-ebcdic-greekmodern3350939x-ebcdic-japaneseandjapaneselatin0601200utf-16"), + MCD_T("1320838x-ebcdic-thai"),MCD_T("1028591iso-ir-100"), + MCD_T("0800708asmo-7081028594iso-ir-1101028592iso-ir-101"), + MCD_T("0628598hebrew"),MCD_T("1057003x-iscii-be"), + MCD_T("1901145x-ebcdic-spain-euro1620127iso_646.irv:1991"), + MCD_T("1057002x-iscii-de0600949korean"), + MCD_T("1628596csisolatinarabic1710008x-mac-chinesesimp"),MCD_T(""), + MCD_T("1028595iso-ir-1440520866koi8r1028597iso-ir-1261057008x-iscii-ka"), + MCD_T("1028596iso-ir-1272420880x-ebcdic-cyrillicrussian1028593iso-ir-109"), + MCD_T("1000936gb_2312-801057009x-iscii-ma1128605iso_8859-15"), + MCD_T("1028598iso-ir-138"), + MCD_T("1901144x-ebcdic-italy-euro1028599iso-ir-1480928591iso8859-1"), + MCD_T("0928592iso8859-21057011x-iscii-pa1000949iso-ir-1490820127us-ascii"), + MCD_T("1220106x-ia5-german"),MCD_T(""), + MCD_T("2320833x-ebcdic-koreanextended1057006x-iscii-as"), + MCD_T("1057004x-iscii-ta"),MCD_T("1000932x-ms-cp932"),MCD_T(""), + MCD_T("0800936gb2312800721866koi8-ru"), + MCD_T("1057005x-iscii-te1300949csksc56011987"),MCD_T(""),MCD_T(""), + MCD_T("0400950big52650935x-ebcdic-simplifiedchinese1057010x-iscii-gu"), + MCD_T("0500775cp5000628598visual3321025x-ebcdic-cyrillicserbianbulgarian"), + MCD_T("1150221csiso2022jp1765000unicode-1-1-utf-7"), + MCD_T("1765001unicode-1-1-utf-81765001unicode-2-0-utf-8"), + 
MCD_T("4551932extended_unix_code_packed_format_for_japanese0900932shift_jis"), + MCD_T("1057007x-iscii-or1150225csiso2022kr"),MCD_T(""), + MCD_T("1951932cseucpkdfmtjapanese0501361johab0910000macintosh"), + MCD_T("0829001x-europa1620273x-ebcdic-germany"), + MCD_T("1628598csisolatinhebrew"),MCD_T("1320000x-chinese-cns0500437cp437"), + MCD_T("0500852cp8520500870cp870"),MCD_T("0520127ascii0520127cp367"), + MCD_T("1500420x-ebcdic-arabic"),MCD_T("0528591cp819"),MCD_T("0700720dos-720"), + MCD_T("1210004x-mac-arabic0500866cp8660700949ksc5601"), + MCD_T("0528597greek1965000x-unicode-2-0-utf-7"), + MCD_T("1400949ks_c_5601-19871965001x-unicode-2-0-utf-8"),MCD_T(""), + MCD_T("0420866koi82520290x-ebcdic-japanesekatakana1400949ks_c_5601-1989"), + MCD_T(""),MCD_T("1128592csisolatin20700862dos-862"), + MCD_T("1100437codepage4370201252us"),MCD_T("1128594csisolatin40700874tis-620"), + MCD_T("1128595csisolatin50700874dos-874"), + MCD_T("1501143x-ebcdic-france1200037ebcdic-cp-us1520108x-ia5-norwegian"), + MCD_T("1820873x-ebcdic-icelandic"), + MCD_T("1150220iso-2022-jp1100874windows-874"),MCD_T("0620866koi8-r"), + MCD_T("0800950x-x-big51510079x-mac-icelandic"), + MCD_T("2220278x-ebcdic-finlandsweden2850933x-ebcdic-japaneseanduscanada1620905x-ebcdic-turkish1150225iso-2022-kr"), + MCD_T("0621866koi8-u"),MCD_T("1028591iso-8859-1"), + MCD_T("1028592iso-8859-21310081x-mac-turkish"), + MCD_T("0828597elot_9280800932ms_kanji1028593iso-8859-30801252iso-ir-6"), + MCD_T("1028594iso-8859-40528593csiso1565000csunicode11utf7"), + MCD_T("1028595iso-8859-5"),MCD_T("1028596iso-8859-6"), + MCD_T("1028597iso-8859-73250933x-ebcdic-koreanandkoreanextended"), + MCD_T("1238598iso-8859-8-i1028598iso-8859-81520424x-ebcdic-hebrew1650222_iso-2022-jp$sio"), + MCD_T("0900936gb2312-801028599iso-8859-91101201unicodefffe"), + MCD_T("0600936gb2312"),MCD_T("1210005x-mac-hebrew"),MCD_T(""),MCD_T(""), + MCD_T("2101143x-ebcdic-germany-euro1210003x-mac-korean"),MCD_T(""), + 
MCD_T("0720866cskoi8r1528597csisolatingreek"),MCD_T(""), + MCD_T("1500936csiso58gb2312800828595cyrillic"),MCD_T(""),MCD_T(""), + MCD_T("0601026cp1026"),MCD_T(""),MCD_T(""), + MCD_T("0810029x-mac-ce0900949ks_c_5601"),MCD_T(""), + MCD_T("0601256cp12561052936hz-gb-23121528591iso_8859-1:1987"), + MCD_T("1528592iso_8859-2:1987"), + MCD_T("1000936csgb2312802450930x-ebcdic-japaneseandkana"), + MCD_T("1528593iso_8859-3:19880520105x-ia5"), + MCD_T("1528594iso_8859-4:19880651936euc-cn"), + MCD_T("1528596iso_8859-6:19871528595iso_8859-5:19882220277x-ebcdic-denmarknorway"), + MCD_T("0600950csbig51528597iso_8859-7:19872001147x-ebcdic-france-euro"), + MCD_T("1201250windows-12502301149x-ebcdic-icelandic-euro"), + MCD_T("1201251windows-12511528598iso_8859-8:1988"), + MCD_T("0801250x-cp12501201252windows-1252"), + MCD_T("0801251x-cp12511201253windows-12531400949ks_c_5601_19871528599iso_8859-9:1989"), + MCD_T("2701143x-ebcdic-finlandsweden-euro1201254windows-1254"), + MCD_T("1201255windows-1255"),MCD_T("1201256windows-12560651932euc-jp"), + MCD_T("1201257windows-12570738598logical"), + MCD_T("0628597greek81201258windows-1258"),MCD_T("0600949euc-kr"),MCD_T(""), + MCD_T("0700936chinese0565000utf-70720127csascii0628591latin1"), + MCD_T("0628592latin20851936x-euc-cn1120285x-ebcdic-uk0565001utf-8"), + MCD_T("0628593latin3"), + MCD_T("0628594latin41420423x-ebcdic-greek1901140x-ebcdic-cp-us-euro"), + MCD_T("0628599latin5"),MCD_T("1828595csisolatincyrillic"), + MCD_T("1110006x-mac-greek1100874iso-8859-11"), + MCD_T("1028591iso_8859-10601252x-ansi"), + MCD_T("1028592iso_8859-20628605latin90701200unicode"), + MCD_T("1028593iso_8859-3"), + MCD_T("1028594iso_8859-40851932x-euc-jp1128605iso-8859-15"), + MCD_T("1028595iso_8859-5"),MCD_T("1028596iso_8859-6"), + MCD_T("1028597iso_8859-7"), + MCD_T("0900936iso-ir-581028598iso_8859-81201255iso_8859-8-i1200932cswindows31j") +}; + +int CMarkup::x_GetEncodingCodePage( MCD_CSTR pszEncoding ) +{ + // redo for completeness, the iconv set, UTF-32, and 
uppercase + + // Lookup strEncoding in EncodingNameTable and return Windows code page + int nCodePage = -1; + int nEncLen = MCD_PSZLEN( pszEncoding ); + if ( ! nEncLen ) + nCodePage = MCD_ACP; + else if ( MCD_PSZNCMP(pszEncoding,MCD_T("UTF-32"),6) == 0 ) + nCodePage = MCD_UTF32; + else if ( nEncLen < 100 ) + { + // Lower-case the name into a local buffer; nEncLen < 100 leaves room for the terminator + MCD_CHAR szEncodingLower[100]; + for ( int nEncChar=0; nEncChar<nEncLen; ++nEncChar ) + { + MCD_CHAR cEncChar = pszEncoding[nEncChar]; + szEncodingLower[nEncChar] = (cEncChar>='A' && cEncChar<='Z')? (MCD_CHAR)(cEncChar+('a'-'A')) : cEncChar; + } + szEncodingLower[nEncLen] = '\0'; + MCD_PCSZ pEntry = EncodingNameTable[x_Hash(szEncodingLower,sizeof(EncodingNameTable)/sizeof(MCD_PCSZ))]; + while ( *pEntry ) + { + // e.g. entry: 0565001utf-8 means length 05, code page 65001, encoding name utf-8 + int nEntryLen = (*pEntry - '0') * 10; + ++pEntry; + nEntryLen += (*pEntry - '0'); + ++pEntry; + MCD_PCSZ pCodePage = pEntry; + pEntry += 5; + if ( nEntryLen == nEncLen && MCD_PSZNCMP(szEncodingLower,pEntry,nEntryLen) == 0 ) + { + // Convert digits to integer up to code name which always starts with alpha + nCodePage = MCD_PSZTOL( pCodePage, NULL, 10 ); + break; + } + pEntry += nEntryLen; + } + } + // Still -1 if the name was 100+ chars or matched no entry in its table slot + return nCodePage; +} + +#if ! defined(MARKUP_WCHAR) +bool CMarkup::x_CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding ) +{ + // Return true if MB to MB conversion is possible +#if defined(MARKUP_ICONV) + // iconv_open should fail if either encoding not supported or one is alias for other + char szTo[100], szFrom[100]; + iconv_t cd = iconv_open( x_IConvName(szTo,pszToEncoding), x_IConvName(szFrom,pszFromEncoding) ); + if ( cd == (iconv_t)-1 ) + return false; + iconv_close(cd); +#else + int nToCP = x_GetEncodingCodePage( pszToEncoding ); + int nFromCP = x_GetEncodingCodePage( pszFromEncoding ); + if ( nToCP == -1 || nFromCP == -1 ) + return false; +#if defined(MARKUP_WINCONV) + if ( nToCP == MCD_ACP || nFromCP == MCD_ACP ) // either ACP ANSI?
+ { + int nACP = GetACP(); + if ( nToCP == MCD_ACP ) + nToCP = nACP; + if ( nFromCP == MCD_ACP ) + nFromCP = nACP; + } +#else // no conversion API, but we can do AToUTF8 and UTF8ToA + if ( nToCP != MCD_UTF8 && nFromCP != MCD_UTF8 ) // either UTF-8? + return false; +#endif // no conversion API + if ( nToCP == nFromCP ) + return false; +#endif // not ICONV + return true; +} +#endif // not WCHAR + +#if defined(MARKUP_ICONV) +const char* CMarkup::x_IConvName( char* szEncoding, MCD_CSTR pszEncoding ) +{ + // Make upper case char-based name from strEncoding which consists only of characters in the ASCII range + int nEncChar = 0; + while ( pszEncoding[nEncChar] ) + { + char cEncChar = (char)pszEncoding[nEncChar]; + szEncoding[nEncChar] = (cEncChar>='a' && cEncChar<='z')? (cEncChar-('a'-'A')) : cEncChar; + ++nEncChar; + } + szEncoding[nEncChar] = '\0'; + return szEncoding; +} + +int CMarkup::x_IConv( ConvertEncoding& convert, void* pTo, int nToCharSize, int nFromCharSize ) +{ + // Converts from any MB/UTF-8 to MB/UTF-8 + char szTo[100], szFrom[100]; + iconv_t cd = iconv_open( x_IConvName(szTo,convert.strToEncoding), x_IConvName(szFrom,convert.strFromEncoding) ); + int nToLenBytes = 0; + if ( cd != (iconv_t)-1 ) + { + size_t nFromLenRemaining = (size_t)convert.nFromLen * nFromCharSize; + size_t nToCountRemaining = (size_t)convert.nToCount * nToCharSize; + size_t nToCountRemainingBefore; + char* pToChar = (char*)pTo; + char* pFromChar = (char*)convert.pFrom; + char* pToTempBuffer = NULL; + const size_t nTempBufferSize = 2048; + size_t nResult; + if ( ! 
pTo ) + { + pToTempBuffer = new char[nTempBufferSize]; + pToChar = pToTempBuffer; + nToCountRemaining = nTempBufferSize; + } + while ( nFromLenRemaining ) + { + nToCountRemainingBefore = nToCountRemaining; + nResult = iconv( cd, &pFromChar, &nFromLenRemaining, &pToChar, &nToCountRemaining ); + nToLenBytes += (int)(nToCountRemainingBefore - nToCountRemaining); + if ( nResult == (size_t)-1 ) + { + // Bypass bad char, question mark denotes problem in source string + pFromChar += nFromCharSize; + nFromLenRemaining -= nFromCharSize; + if ( nToCharSize == 1 ) + *pToChar = '?'; + else if ( nToCharSize == 2 ) + *((unsigned short*)pToChar) = (unsigned short)'?'; + else if ( nToCharSize == 4 ) + *((unsigned int*)pToChar) = (unsigned int)'?'; + pToChar += nToCharSize; + nToCountRemaining -= nToCharSize; + } + else + convert.nFailedChars += nResult; + if ( pToTempBuffer && nToCountRemaining < 10 ) + { + nToCountRemaining = nTempBufferSize; + pToChar = pToTempBuffer; + } + } + if ( pToTempBuffer ) + delete[] pToTempBuffer; + iconv_close(cd); + } + return nToLenBytes / nToCharSize; +} +#endif + +int CMarkup::x_ConvertEncoding( ConvertEncoding& convert, void* pTo ) +{ + // If pTo is not NULL, it must be large enough to hold result, length of result is returned + // convert.nFailedChars will be set to >0 if characters not supported in strToEncoding + int nToLen = 0; + int nToCP = x_GetEncodingCodePage( convert.strToEncoding ); + if ( nToCP == -1 ) + nToCP = MCD_ACP; + int nFromCP = x_GetEncodingCodePage( convert.strFromEncoding ); + if ( nFromCP == -1 ) + nFromCP = MCD_ACP; + convert.nFailedChars = 0; + +#if ! defined(MARKUP_WINCONV) && ! 
defined(MARKUP_ICONV) + // Only non-Unicode encoding supported is locale charset, must call setlocale + if ( nToCP != MCD_UTF8 && nToCP != MCD_UTF16 && nToCP != MCD_UTF32 ) + nToCP = MCD_ACP; + if ( nFromCP != MCD_UTF8 && nFromCP != MCD_UTF16 && nFromCP != MCD_UTF32 ) + nFromCP = MCD_ACP; + if ( nFromCP == MCD_ACP ) + { + const char* pA = (const char*)convert.pFrom; + int nALenRemaining = convert.nFromLen; + int nCharLen; + wchar_t wcChar; + char* pU = (char*)pTo; + while ( nALenRemaining ) + { + nCharLen = mbtowc( &wcChar, pA, nALenRemaining ); + if ( nCharLen < 1 ) + { + wcChar = (wchar_t)'?'; + nCharLen = 1; + } + pA += nCharLen; + nALenRemaining -= nCharLen; + if ( nToCP == MCD_UTF8 ) + EncodeCharUTF8( (int)wcChar, pU, nToLen ); + else if ( nToCP == MCD_UTF16 ) + EncodeCharUTF16( (int)wcChar, (unsigned short*)pU, nToLen ); + else // UTF32 + { + if ( pU ) + ((unsigned int*)pU)[nToLen] = (unsigned int)wcChar; + ++nToLen; + } + } + } + else if ( nToCP == MCD_ACP ) + { + union pUnicodeUnion { const char* p8; const unsigned short* p16; const unsigned int* p32; } pU; + pU.p8 = (const char*)convert.pFrom; + const char* pUEnd = pU.p8 + convert.nFromLen; + if ( nFromCP == MCD_UTF16 ) + pUEnd = (char*)( pU.p16 + convert.nFromLen ); + else if ( nFromCP == MCD_UTF32 ) + pUEnd = (char*)( pU.p32 + convert.nFromLen ); + int nCharLen; + char* pA = (char*)pTo; + char szA[8]; + int nUChar; + while ( pU.p8 != pUEnd ) + { + if ( nFromCP == MCD_UTF8 ) + nUChar = DecodeCharUTF8( pU.p8, pUEnd ); + else if ( nFromCP == MCD_UTF16 ) + nUChar = DecodeCharUTF16( pU.p16, (const unsigned short*)pUEnd ); + else // UTF32 + nUChar = *(pU.p32)++; + if ( nUChar == -1 ) + nCharLen = -2; + else if ( nUChar & ~0xffff ) + nCharLen = -1; + else + nCharLen = wctomb( pA?pA:szA, (wchar_t)nUChar ); + if ( nCharLen < 0 ) + { + if ( nCharLen == -1 ) + ++convert.nFailedChars; + nCharLen = 1; + if ( pA ) + *pA = '?'; + } + if ( pA ) + pA += nCharLen; + nToLen += nCharLen; + } + } +#endif // not WINCONV and 
not ICONV + + if ( nFromCP == MCD_UTF32 ) + { + const unsigned int* p32 = (const unsigned int*)convert.pFrom; + const unsigned int* p32End = p32 + convert.nFromLen; + if ( nToCP == MCD_UTF8 ) + { + char* p8 = (char*)pTo; + while ( p32 != p32End ) + EncodeCharUTF8( *p32++, p8, nToLen ); + } + else if ( nToCP == MCD_UTF16 ) + { + unsigned short* p16 = (unsigned short*)pTo; + while ( p32 != p32End ) + EncodeCharUTF16( (int)*p32++, p16, nToLen ); + } + else // to ANSI + { + // WINCONV not supported for 32To8, since only used for sizeof(wchar_t) == 4 +#if defined(MARKUP_ICONV) + nToLen = x_IConv( convert, pTo, 1, 4 ); +#endif // ICONV + } + } + else if ( nFromCP == MCD_UTF16 ) + { + // UTF16To8 will be deprecated since weird output buffer size sensitivity not worth implementing here + const unsigned short* p16 = (const unsigned short*)convert.pFrom; + const unsigned short* p16End = p16 + convert.nFromLen; + int nUChar; + if ( nToCP == MCD_UTF32 ) + { + unsigned int* p32 = (unsigned int*)pTo; + while ( p16 != p16End ) + { + nUChar = DecodeCharUTF16( p16, p16End ); + if ( nUChar == -1 ) + nUChar = '?'; + if ( p32 ) + p32[nToLen] = (unsigned int)nUChar; + ++nToLen; + } + } +#if defined(MARKUP_WINCONV) + else // to UTF-8 or other multi-byte + { + nToLen = WideCharToMultiByte(nToCP,0,(const wchar_t*)convert.pFrom,convert.nFromLen,(char*)pTo, + convert.nToCount?convert.nToCount+1:0,NULL, + (nToCP==CP_UTF8)?NULL:&convert.nFailedChars); // fails with lpUsedDefaultChar and CP_UTF8 + } +#else // not WINCONV + else if ( nToCP == MCD_UTF8 ) + { + char* p8 = (char*)pTo; + while ( p16 != p16End ) + { + nUChar = DecodeCharUTF16( p16, p16End ); + if ( nUChar == -1 ) + nUChar = '?'; + EncodeCharUTF8( nUChar, p8, nToLen ); + } + } + else // to ANSI + { +#if defined(MARKUP_ICONV) + nToLen = x_IConv( convert, pTo, 1, 2 ); +#endif // ICONV + } +#endif // not WINCONV + } + else if ( nToCP == MCD_UTF16 ) // to UTF-16 from UTF-8/ANSI + { +#if defined(MARKUP_WINCONV) + nToLen = 
MultiByteToWideChar(nFromCP,0,(const char*)convert.pFrom,convert.nFromLen,(wchar_t*)pTo,convert.nToCount); +#else // not WINCONV + if ( nFromCP == MCD_UTF8 ) + { + const char* p8 = (const char*)convert.pFrom; + const char* p8End = p8 + convert.nFromLen; + int nUChar; + unsigned short* p16 = (unsigned short*)pTo; + while ( p8 != p8End ) + { + nUChar = DecodeCharUTF8( p8, p8End ); + if ( nUChar == -1 ) + nUChar = '?'; + if ( p16 ) + p16[nToLen] = (unsigned short)nUChar; + ++nToLen; + } + } + else // from ANSI + { +#if defined(MARKUP_ICONV) + nToLen = x_IConv( convert, pTo, 2, 1 ); +#endif // ICONV + } +#endif // not WINCONV + } + else if ( nToCP == MCD_UTF32 ) // to UTF-32 from UTF-8/ANSI + { + if ( nFromCP == MCD_UTF8 ) + { + const char* p8 = (const char*)convert.pFrom; + const char* p8End = p8 + convert.nFromLen; + int nUChar; + unsigned int* p32 = (unsigned int*)pTo; + while ( p8 != p8End ) + { + nUChar = DecodeCharUTF8( p8, p8End ); + if ( nUChar == -1 ) + nUChar = '?'; + if ( p32 ) + p32[nToLen] = (unsigned int)nUChar; + ++nToLen; + } + } + else // from ANSI + { + // WINCONV not supported for ATo32, since only used for sizeof(wchar_t) == 4 +#if defined(MARKUP_ICONV) + nToLen = x_IConv( convert, pTo, 4, 1 ); +#endif // ICONV + } + } + else + { +#if defined(MARKUP_ICONV) + nToLen = x_IConv( convert, pTo, 1, 1 ); +#elif defined(MARKUP_WINCONV) + wchar_t* pwszUTF16 = new wchar_t[convert.nFromLen]; + int nUTF16Len = MultiByteToWideChar(nFromCP,0,(const char*)convert.pFrom,convert.nFromLen,pwszUTF16,convert.nFromLen); + nToLen = WideCharToMultiByte(nToCP,0,pwszUTF16,nUTF16Len,(char*)pTo,convert.nToCount,NULL, + (nToCP==CP_UTF8)?NULL:&convert.nFailedChars); // cannot specify lpUsedDefaultChar if UTF-8! 
+ delete[] pwszUTF16; +#endif // WINCONV + } + + // Store the length in case this is called again after allocating output buffer to fit + convert.nToCount = nToLen; + return nToLen; +} + +bool CMarkup::x_EndianSwapRequired( int nDocFlags ) +{ + short nWord = 1; + char cFirstByte = ((char*)&nWord)[0]; + if ( cFirstByte ) // LE + { + if ( nDocFlags & MDF_UTF16BEFILE ) + return true; + } + else if ( nDocFlags & MDF_UTF16LEFILE ) + return true; + return false; +} + +void CMarkup::x_EndianSwapUTF16( unsigned short* pBuffer, int nCharLen ) +{ + unsigned short cChar; + while ( nCharLen-- ) + { + cChar = pBuffer[nCharLen]; + pBuffer[nCharLen] = (unsigned short)((cChar<<8) | (cChar>>8)); + } +} + +struct BomTableStruct { const char* pszBom; int nBomLen; MCD_PCSZ pszBomEnc; int nBomFlag; } BomTable[] = +{ + { "\xef\xbb\xbf", 3, MCD_T("UTF-8"), CMarkup::MDF_UTF8PREAMBLE }, + { "\xff\xfe", 2, MCD_T("UTF-16LE"), CMarkup::MDF_UTF16LEFILE }, + { "\xfe\xff", 2, MCD_T("UTF-16BE"), CMarkup::MDF_UTF16BEFILE }, + { NULL,0,NULL,0 } +}; + +bool CMarkup::x_Open( MCD_CSTR_FILENAME szFileName, FilePos& file ) +{ + MCD_STRCLEAR( file.strIOResult ); + + // Open file + MCD_PCSZ_FILENAME pMode = MCD_T_FILENAME("rb"); + if ( file.nDocFlags & MDF_APPENDFILE ) + pMode = MCD_T_FILENAME("ab"); + else if ( file.nDocFlags & MDF_WRITEFILE ) + pMode = MCD_T_FILENAME("wb"); + file.fp = NULL; + MCD_FOPEN( file.fp, szFileName, pMode ); + if ( ! 
file.fp ) + { + file.strIOResult = x_GetLastError(); + return false; + } + + // Prepare file + bool bSuccess = true; + int nBomLen = 0; + if ( file.nDocFlags & MDF_READFILE ) + { + // Get file length + fseek( file.fp, 0, SEEK_END ); + file.nFileByteLen = ftell( file.fp ); + fseek( file.fp, 0, SEEK_SET ); + + // Read the top of the file to check BOM and encoding + int nReadTop = 1024; + if ( file.nFileByteLen < nReadTop ) + nReadTop = file.nFileByteLen; + if ( nReadTop ) + { + char* pFileTop = new char[nReadTop]; + if ( nReadTop ) + bSuccess = ( fread( pFileTop, nReadTop, 1, file.fp ) == 1 ); + if ( bSuccess ) + { + // Check for Byte Order Mark (preamble) + int nBomCheck = 0; + file.nDocFlags &= ~( MDF_UTF16LEFILE | MDF_UTF8PREAMBLE ); + file.nFileCharUnitSize = 1; // unless UTF-16 BOM found + while ( BomTable[nBomCheck].pszBom ) + { + while ( nBomLen < BomTable[nBomCheck].nBomLen ) + { + if ( nBomLen >= nReadTop || pFileTop[nBomLen] != BomTable[nBomCheck].pszBom[nBomLen] ) + break; + ++nBomLen; + } + if ( nBomLen == BomTable[nBomCheck].nBomLen ) + { + file.nDocFlags |= BomTable[nBomCheck].nBomFlag; + file.nFileByteLen -= nBomLen; + if ( nBomLen == 2 ) + file.nFileCharUnitSize = 2; + file.strEncoding = BomTable[nBomCheck].pszBomEnc; + break; + } + ++nBomCheck; + nBomLen = 0; + } + if ( nReadTop > nBomLen ) + fseek( file.fp, nBomLen, SEEK_SET ); + + // Encoding check + if ( ! 
nBomLen ) + { + MCD_STR strDeclCheck; +#if defined(MARKUP_WCHAR) // WCHAR + ConvertEncoding convert( MCD_ENC, MCD_T("UTF-8"), (const void*)pFileTop, nReadTop ); + MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDeclCheck,nReadTop); + convert.nToCount = nReadTop; + int nDeclWideLen = x_ConvertEncoding( convert, (void*)pWideBuffer ); + MCD_RELEASEBUFFER(strDeclCheck,pWideBuffer,nDeclWideLen); +#else // not WCHAR + MCD_STRASSIGN(strDeclCheck,pFileTop,nReadTop); +#endif // not WCHAR + file.strEncoding = GetDeclaredEncoding( strDeclCheck ); + } + // Assume markup files starting with < sign are UTF-8 if otherwise unknown + if ( MCD_STRISEMPTY(file.strEncoding) && pFileTop[0] == '<' ) + file.strEncoding = MCD_T("UTF-8"); + } + delete [] pFileTop; + } + } + else if ( file.nDocFlags & MDF_WRITEFILE ) + { + file.nFileCharUnitSize = 1; + if ( file.nDocFlags & MDF_APPENDFILE ) + file.nFileByteLen = ftell( file.fp ); + if ( ! file.nFileByteLen ) + { + int nBomCheck = 0; + while ( BomTable[nBomCheck].pszBom ) + { + if ( file.nDocFlags & BomTable[nBomCheck].nBomFlag ) + { + nBomLen = BomTable[nBomCheck].nBomLen; + if ( nBomLen == 2 ) + { + file.nFileCharUnitSize = 2; + file.strEncoding = BomTable[nBomCheck].pszBomEnc; + } + bSuccess = ( fwrite(BomTable[nBomCheck].pszBom,nBomLen,1,file.fp) == 1 ); + break; + } + ++nBomCheck; + } + } + } + if ( ! bSuccess ) + { + file.strIOResult = x_GetLastError(); + return false; + } + + if ( ! 
MCD_STRISEMPTY(file.strEncoding) ) + { + file.strIOResult = file.strEncoding; + if ( nBomLen ) + file.strIOResult += MCD_T(" BOM +"); + file.strIOResult += MCD_T(" "); + } + return bSuccess; +} + +bool CMarkup::x_Read( void* pBuffer, FilePos& file ) +{ + MCD_CHAR szReadInfo[100] = {0}; + bool bSuccess = ( fread( pBuffer,file.nReadByteLen,1,file.fp) == 1 ); + file.nFileTextLen = file.nReadByteLen / file.nFileCharUnitSize; + if ( bSuccess ) + { + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("length %d "), file.nFileTextLen ); + file.strIOResult = szReadInfo; + + // Microsoft components can produce apparently valid docs with some nulls at ends of values + int nNullCount = 0; + int nNullCheckCharsRemaining = file.nFileTextLen; + char* pAfterNull = NULL; + char* pNullScan = (char*)pBuffer; + bool bSingleByteChar = file.nFileCharUnitSize == 1; + while ( nNullCheckCharsRemaining-- ) + { + if ( bSingleByteChar? (! *pNullScan) : (! (*(unsigned short*)pNullScan)) ) + { + if ( pAfterNull && pNullScan != pAfterNull ) + memmove( pAfterNull - (nNullCount*file.nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull ); + pAfterNull = pNullScan + file.nFileCharUnitSize; + ++nNullCount; + } + pNullScan += file.nFileCharUnitSize; + } + if ( pAfterNull && pNullScan != pAfterNull ) + memmove( pAfterNull - (nNullCount*file.nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull ); + if ( nNullCount ) + { + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("(%d nulls removed) "), nNullCount ); + file.strIOResult += szReadInfo; + file.nFileTextLen -= nNullCount; + } + + // Big endian/little endian conversion + if ( file.nFileCharUnitSize > 1 && x_EndianSwapRequired(file.nDocFlags) ) + { + x_EndianSwapUTF16( (unsigned short*)pBuffer, file.nFileTextLen ); + file.strIOResult += MCD_T("endian swap "); + } + } + if ( ! 
bSuccess ) + file.strIOResult = x_GetLastError(); + return bSuccess; +} + +bool CMarkup::x_ReadText( MCD_STR& strDoc, FilePos& file ) +{ + bool bSuccess = true; + MCD_CHAR szReadInfo[100] = {0}; + MCD_STRCLEAR( file.strIOResult ); + if ( ! file.nReadByteLen ) + { + file.strIOResult = MCD_T("0 length "); + return bSuccess; + } + + if ( file.nDocFlags & (MDF_UTF16LEFILE | MDF_UTF16BEFILE) ) + { + int nUTF16Len = file.nReadByteLen / 2; +#if defined(MARKUP_WCHAR) // WCHAR + int nBufferSizeForGrow = nUTF16Len + nUTF16Len/100; // extra 1% +#if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 + unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1]; + bSuccess = x_Read( pUTF16Buffer, file ); + if ( bSuccess ) + { + // x_Read may have stripped nulls, so convert only the text that remains + nUTF16Len = file.nFileTextLen; + ConvertEncoding convert( MCD_T("UTF-32"), MCD_T("UTF-16"), (const void*)pUTF16Buffer, nUTF16Len ); + convert.nToCount = nBufferSizeForGrow; + MCD_CHAR* pUTF32Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + int nUTF32Len = x_ConvertEncoding( convert, (void*)pUTF32Buffer ); + MCD_RELEASEBUFFER(strDoc,pUTF32Buffer,nUTF32Len); + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to UTF-32 length %d "), nUTF32Len ); + } + // Release the temporary read buffer on both the success and failure paths + delete [] pUTF16Buffer; +#else // sizeof(wchar_t) == 2 + MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + bSuccess = x_Read( pUTF16Buffer, file ); + MCD_RELEASEBUFFER(strDoc,pUTF16Buffer,file.nFileTextLen); +#endif // sizeof(wchar_t) == 2 +#else // not WCHAR + // Convert file from UTF-16; it needs to be in memory as UTF-8 or MBCS + unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1]; + bSuccess = x_Read( pUTF16Buffer, file ); + nUTF16Len = file.nFileTextLen; + ConvertEncoding convert( MCD_ENC, MCD_T("UTF-16"), (const void*)pUTF16Buffer, nUTF16Len ); + int nMBLen = x_ConvertEncoding( convert, NULL ); + int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% + MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + x_ConvertEncoding( convert, (void*)pMBBuffer ); + delete [] pUTF16Buffer; +
MCD_RELEASEBUFFER(strDoc,pMBBuffer,nMBLen); + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to %d bytes "), nMBLen ); + if ( convert.nFailedChars ) + MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in conversion!) ") ); +#endif // not WCHAR + } + else // single or multibyte file (i.e. not UTF-16) + { +#if defined(MARKUP_WCHAR) // WCHAR + char* pBuffer = new char[file.nReadByteLen]; + bSuccess = x_Read( pBuffer, file ); + if ( MCD_STRISEMPTY(file.strEncoding) ) + { + if ( DetectUTF8(pBuffer,file.nReadByteLen) ) + file.strEncoding = MCD_T("UTF-8"); + file.strIOResult += MCD_T("(used UTF-8 detection) "); + } + ConvertEncoding convert( MCD_ENC, file.strEncoding, (const void*)pBuffer, file.nFileTextLen ); + int nWideLen = x_ConvertEncoding( convert, NULL ); + int nBufferSizeForGrow = nWideLen + nWideLen/100; // extra 1% + MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + x_ConvertEncoding( convert, (void*)pWideBuffer ); + MCD_RELEASEBUFFER( strDoc, pWideBuffer, nWideLen ); + delete [] pBuffer; + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to %d wide chars "), nWideLen ); +#else // not WCHAR + // After loading a file with unknown multi-byte encoding + bool bAssumeUnknownIsNative = false; + if ( MCD_STRISEMPTY(file.strEncoding) ) + { + bAssumeUnknownIsNative = true; + file.strEncoding = MCD_ENC; + } + if ( x_CanConvert(MCD_ENC,file.strEncoding) ) + { + char* pBuffer = new char[file.nReadByteLen]; + bSuccess = x_Read( pBuffer, file ); + ConvertEncoding convert( MCD_ENC, file.strEncoding, (const void*)pBuffer, file.nFileTextLen ); + int nMBLen = x_ConvertEncoding( convert, NULL ); + int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% + MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + x_ConvertEncoding( convert, (void*)pMBBuffer ); + MCD_RELEASEBUFFER( strDoc, pMBBuffer, nMBLen ); + delete [] pBuffer; + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to length %d "), nMBLen ); + if ( convert.nFailedChars ) + MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in 
conversion!) ") ); + } + else // load directly into string + { + int nBufferSizeForGrow = file.nReadByteLen + file.nReadByteLen/100; // extra 1% + MCD_CHAR* pBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); + bSuccess = x_Read( pBuffer, file ); + MCD_RELEASEBUFFER( strDoc, pBuffer, file.nFileTextLen ); + + // Might need additional conversion if we assumed an encoding + if ( bAssumeUnknownIsNative ) + { + int nNonASCII; + bool bIsUTF8 = DetectUTF8( MCD_2PCSZ(strDoc), file.nReadByteLen, &nNonASCII ); + MCD_STR strDetectedEncoding = bIsUTF8? MCD_T("UTF-8"): MCD_T(""); + if ( nNonASCII && file.strEncoding != strDetectedEncoding ) // only need to convert non-ASCII + { + file.strEncoding = strDetectedEncoding; + ConvertEncoding convert( MCD_ENC, file.strEncoding, MCD_2PCSZ(strDoc), file.nFileTextLen ); + int nMBLen = x_ConvertEncoding( convert, NULL ); + nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% + MCD_STR strConvDoc; + MCD_CHAR* pBuffer = MCD_GETBUFFER(strConvDoc,nBufferSizeForGrow); + x_ConvertEncoding( convert, (void*)pBuffer ); + MCD_RELEASEBUFFER( strConvDoc, pBuffer, nMBLen ); + strDoc = strConvDoc; + MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to length %d "), nMBLen ); + if ( convert.nFailedChars ) + MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in conversion!) ") ); + } + MCD_PSZCAT( szReadInfo, MCD_T("(used UTF-8 detection) ") ); + } + } +#endif // not WCHAR + } + if ( bSuccess ) + file.strIOResult += szReadInfo; + else + file.strIOResult = x_GetLastError(); + return bSuccess; +} + +bool CMarkup::x_Write( void* pBuffer, FilePos& file, const void* pConstBuffer /*=NULL*/ ) +{ + MCD_CHAR szWriteInfo[100] = {0}; + size_t nDocByteLength = file.nFileTextLen * file.nFileCharUnitSize; + if ( ! pConstBuffer ) + pConstBuffer = pBuffer; + unsigned short* pTempEndianBuffer = NULL; + if ( x_EndianSwapRequired(file.nDocFlags) ) + { + if ( ! 
pBuffer ) + { + pTempEndianBuffer = new unsigned short[file.nFileTextLen]; + memcpy( pTempEndianBuffer, pConstBuffer, file.nFileTextLen * 2 ); + pBuffer = pTempEndianBuffer; + pConstBuffer = pTempEndianBuffer; + } + x_EndianSwapUTF16( (unsigned short*)pBuffer, file.nFileTextLen ); + file.strIOResult += MCD_T("endian swap "); + } + bool bSuccess = ( fwrite( pConstBuffer, nDocByteLength, 1, file.fp ) == 1 ); + if ( pTempEndianBuffer ) + delete [] pTempEndianBuffer; + if ( bSuccess ) + { + MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("length %d "), file.nFileTextLen ); + file.strIOResult += szWriteInfo; + } + else + file.strIOResult = x_GetLastError(); + return bSuccess; +} + +bool CMarkup::x_WriteText( const MCD_STR& strDoc, FilePos& file ) +{ + bool bSuccess = true; + MCD_CHAR szWriteInfo[100] = {0}; + MCD_STRCLEAR( file.strIOResult ); + MCD_PCSZ pDoc = MCD_2PCSZ(strDoc); + int nWriteStrLen = MCD_STRLENGTH(strDoc); + if ( ! nWriteStrLen ) + { + file.strIOResult = MCD_T("0 length "); + return bSuccess; + } + + if ( file.nDocFlags & (MDF_UTF16LEFILE | MDF_UTF16BEFILE) ) + { +#if defined(MARKUP_WCHAR) // WCHAR +#if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 + ConvertEncoding convert( MCD_T("UTF-16"), MCD_T("UTF-32"), (const void*)pDoc, nWriteStrLen ); + file.nFileTextLen = x_ConvertEncoding( convert, NULL ); + unsigned short* pUTF16Buffer = new unsigned short[file.nFileTextLen]; + x_ConvertEncoding( convert, (void*)pUTF16Buffer ); + MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from wide-32 length %d "), nWriteStrLen ); + bSuccess = x_Write( pUTF16Buffer, file ); + delete [] pUTF16Buffer; +#else // sizeof(wchar_t) == 2 + file.nFileTextLen = nWriteStrLen; + bSuccess = x_Write( NULL, file, pDoc ); +#endif +#else // not WCHAR + ConvertEncoding convert( MCD_T("UTF-16"), MCD_ENC, (const void*)pDoc, nWriteStrLen ); + file.nFileTextLen = x_ConvertEncoding( convert, NULL ); + unsigned short* pUTF16Buffer = new unsigned short[file.nFileTextLen]; + x_ConvertEncoding( convert, 
(void*)pUTF16Buffer ); + MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from byte length %d "), nWriteStrLen ); + bSuccess = x_Write( pUTF16Buffer, file ); + delete [] pUTF16Buffer; +#endif // not WCHAR + } + else // single or multibyte file (i.e. not UTF-16) + { +#if defined(MARKUP_WCHAR) // WCHAR + ConvertEncoding convert( file.strEncoding, MCD_ENC, (const void*)pDoc, nWriteStrLen ); + file.nFileTextLen = x_ConvertEncoding( convert, NULL ); + char* pMBBuffer = new char[file.nFileTextLen]; + x_ConvertEncoding( convert, (void*)pMBBuffer ); + MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from wide length %d "), nWriteStrLen ); + if ( convert.nFailedChars ) + MCD_PSZCAT( szWriteInfo, MCD_T("(chars lost in conversion!) ") ); + bSuccess = x_Write( pMBBuffer, file ); + delete [] pMBBuffer; +#else // not WCHAR + if ( x_CanConvert(file.strEncoding,MCD_ENC) ) + { + ConvertEncoding convert( file.strEncoding, MCD_ENC, (const void*)pDoc, nWriteStrLen ); + file.nFileTextLen = x_ConvertEncoding( convert, NULL ); + char* pMBBuffer = new char[file.nFileTextLen]; + x_ConvertEncoding( convert, (void*)pMBBuffer ); + MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from length %d "), nWriteStrLen ); + if ( convert.nFailedChars ) + MCD_PSZCAT( szWriteInfo, MCD_T("(chars lost in conversion!) 
") ); + bSuccess = x_Write( pMBBuffer, file ); + delete [] pMBBuffer; + } + else // save directly from string + { + file.nFileTextLen = nWriteStrLen; + bSuccess = x_Write( NULL, file, pDoc ); + } +#endif // not WCHAR + } + + if ( bSuccess ) + file.strIOResult += szWriteInfo; + else + file.strIOResult = x_GetLastError(); + return bSuccess; +} + +bool CMarkup::x_Close( FilePos& file ) +{ + if ( file.fp ) + { + fclose( file.fp ); + file.fp = NULL; + file.nDocFlags &= ~(MDF_WRITEFILE|MDF_READFILE|MDF_APPENDFILE); + return true; + } + return false; +} + diff --git a/src/layer.cpp b/src/layer.cpp new file mode 100644 index 0000000..451a9f1 --- /dev/null +++ b/src/layer.cpp @@ -0,0 +1,102 @@ +/************************************************************************************************** + * LibNeural++ v.0.2 - All-purpose library for managing neural networks * + * Copyright (C) 2009, BlackLight * + * * + * This program is free software: you can redistribute it and/or modify it under the terms of the * + * GNU General Public License as published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. This program is distributed in the hope * + * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * + * more details. You should have received a copy of the GNU General Public License along with * + * this program. If not, see . 
* + **************************************************************************************************/ + +#include "neural++.h" +using namespace neuralpp; + +/** + * @brief Constructor + * @param sz Size of the layer + * @param a Activation function + * @param d Its derivative + */ +Layer::Layer (size_t sz, float(*a)(float), float(*d)(float)) { + for (size_t i=0; i &el, float (*a)(float), float(*d)(float)) { + elements=el; + actv_f=a; + deriv=d; +} + +/** + * @return Number of neurons in the layer + */ +size_t Layer::size() { return elements.size(); } + +/** + * @brief Overload of operator []. It gets the neuron at index i + */ +Neuron& Layer::operator[] (size_t i) { return elements[i]; } + +/** + * @brief It links a layer to another + * @param l Layer to connect to the current one as its input layer + */ +void Layer::link (Layer& l) { + srand ((unsigned) time(NULL)); + + for (size_t i=0; ipush_out(s); + n2->push_in(s); + } + } +} + +/** + * @brief It sets a vector of propagation values to all its neurons + * @param v Vector of values to write as propagation values + */ +void Layer::setProp (vector &v) { + for (size_t i=0; i &v) { + for (size_t i=0; isetProp(n->propagate()); + n->setActv( actv_f(n->getProp()) ); + } +} + diff --git a/src/neural_doc.h b/src/neural_doc.h new file mode 100644 index 0000000..776910e --- /dev/null +++ b/src/neural_doc.h @@ -0,0 +1,194 @@ +/************************************************************************************************** + * LibNeural++ v.0.2 - All-purpose library for managing neural networks * + * Copyright (C) 2009, BlackLight * + * * + * This program is free software: you can redistribute it and/or modify it under the terms of the * + * GNU General Public License as published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. 
This program is distributed in the hope * + * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * + * more details. You should have received a copy of the GNU General Public License along with * + * this program. If not, see <http://www.gnu.org/licenses/>. * + **************************************************************************************************/ + +#ifndef __NEURALPP +#define __NEURALPP + +#include <vector> +#include <string> +#include <cmath> +using namespace std; + +namespace neuralpp { + //! Default rand value: |sin(rand)|, always >= 0 and <= 1 (fabsf, so the value is never truncated by the integer abs) + #define RAND ( fabsf( sinf((float) rand()) ) ) + + class Synapsis; + class Neuron; + class Layer; + class NeuralNet; + class NetworkFileNotFoundException; + class InvalidXMLException; + + /** + * @class NetworkFileNotFoundException + * @brief Exception thrown on an attempt to load a network from an invalid file + */ + class NetworkFileNotFoundException : public exception { + public: + NetworkFileNotFoundException() {} + const char* what() const throw() { return "Attempt to load a neural network from an invalid network file\n"; } + }; + + /** + * @class InvalidXMLException + * @brief Exception thrown when trying to parse an invalid XML document + */ + class InvalidXMLException : public exception { + public: + InvalidXMLException() {} + const char* what() const throw() { return "Attempt to load an invalid XML file\n"; } + }; + + /** + * @class NeuralNet + * @brief Main project's class. 
Use *ONLY* this class, unless you know what you're doing + */ + class NeuralNet { + int epochs; + float l_rate; + float ex; + + Layer* input; + Layer* hidden; + Layer* output; + + void updateWeights(); + void commitChanges (Layer *l); + float error(float); + + float (*actv_f)(float); + float (*deriv)(float); + + public: + /** + * @brief Enum to choose the training source for our network (XML from a file or from a string) + */ + typedef enum { file, str } source; + + NeuralNet (size_t, size_t, size_t, float, int); + NeuralNet (size_t, size_t, size_t, float(*)(float), float(*)(float), float, int); + NeuralNet (const char*) throw(); + + float getOutput(); + float expected(); + + vector<float> getVectorOutput(); + + void setExpected(float); + void update(); + void propagate(); + void setInput (vector<float>&); + void link(); + bool save (const char*); + void train(string, source) throw(); + + static vector<float> split (char, string); + static void initXML (string&); + static string XMLFromSet (int, string); + static void closeXML(string&); + }; + + /** + * @class Synapsis + * @brief Class for managing synapsis. Don't use this class directly unless you know what + * you're doing, use NeuralNet instead + */ + class Synapsis { + float delta; + float weight; + + Neuron *in; + Neuron *out; + + float (*actv_f)(float); + float (*deriv)(float); + + public: + Synapsis (Neuron* i, Neuron* o, float(*)(float), float(*)(float)); + Synapsis (Neuron* i, Neuron* o, float w, float(*)(float), float(*)(float)); + + Neuron* getIn(); + Neuron* getOut(); + + void setWeight(float); + void setDelta(float); + + float getWeight(); + float getDelta(); + }; + + /** + * @class Neuron + * @brief Class for managing neurons. 
Don't use this class directly unless you know what + * you're doing, use NeuralNet instead + */ + class Neuron { + float actv_val; + float prop_val; + + vector< Synapsis > in; + vector< Synapsis > out; + + float (*actv_f)(float); + float (*deriv)(float); + public: + Neuron (float (*)(float), float(*)(float)); + Neuron (vector< Synapsis >, vector< Synapsis >, float (*)(float), float(*)(float)); + + Synapsis& synIn (size_t i); + Synapsis& synOut (size_t i); + + void push_in (Synapsis&); + void push_out (Synapsis&); + + void setActv (float); + void setProp (float); + + float getActv(); + float getProp(); + float propagate(); + + + size_t nIn(); + size_t nOut(); + }; + + /** + * @class Layer + * @brief Class for managing layers of neurons. Don't use this class directly unless you know what + * you're doing, use NeuralNet instead + */ + class Layer { + vector< Neuron > elements; + void (*update_weights)(); + + float (*actv_f)(float); + float (*deriv)(float); + public: + Layer (size_t sz, float (*)(float), float(*)(float)); + Layer (vector< Neuron >&, float(*)(float), float(*)(float)); + + Neuron& operator[] (size_t); + + void link (Layer&); + void setProp (vector<float>&); + void setActv (vector<float>&); + void propagate(); + + size_t size(); + }; +} + +#endif + + diff --git a/src/neuralnet.cpp b/src/neuralnet.cpp new file mode 100644 index 0000000..0cfce80 --- /dev/null +++ b/src/neuralnet.cpp @@ -0,0 +1,578 @@ +/************************************************************************************************** + * LibNeural++ v.0.2 - All-purpose library for managing neural networks * + * Copyright (C) 2009, BlackLight * + * * + * This program is free software: you can redistribute it and/or modify it under the terms of the * + * GNU General Public License as published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. 
This program is distributed in the hope * + * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * + * more details. You should have received a copy of the GNU General Public License along with * + * this program. If not, see <http://www.gnu.org/licenses/>. * + **************************************************************************************************/ + +#include "neural++.h" +#include "Markup.h" +#include <cstdio> +using namespace neuralpp; + +/** + * @brief Built-in function. The default activation function: f(x)=x + */ +float __actv(float prop) { return prop; } + +/** + * @brief Default derivative for the default activation function: f'(x)=1 + */ +float __deriv(float prop) { return 1; } + +/** + * @brief Constructor. Builds the three layers with the default activation function and links them + * @param in_size Size of the input layer + * @param hidden_size Size of the hidden layer + * @param out_size Size of the output layer + * @param l Learning rate (get it after doing some experiments, but generally try to + * keep its value quite low to be more accurate) + * @param e Epochs (cycles) to execute (the more you execute, the more accurate the network + * can become for its purpose) + */ +NeuralNet::NeuralNet (size_t in_size, size_t hidden_size, size_t out_size, float l, int e) { + epochs=e; + ref_epochs=epochs; + l_rate=l; + actv_f=__actv; + deriv=__deriv; + ex=0; /* no expected value yet: keeps expected() and save() from reading an indeterminate float */ + input = new Layer(in_size, __actv, __deriv); + hidden = new Layer(hidden_size, __actv, __deriv); + output = new Layer(out_size, __actv, __deriv); + link(); +} + +/** + * @brief Constructor + * @param in_size Size of the input layer + * @param hidden_size Size of the hidden layer + * @param out_size Size of the output layer + * @param a Activation function to use (default: f(x)=x) + * @param d Derivative of the activation function to use (default: f'(x)=1) + * @param l Learning rate (get it after doing some experiments, but generally try to + * keep its value quite low to be more accurate) + 
* @param e Epochs (cycles) to execute (the more you execute, the more accurate the network + * can become for its purpose) + */ +NeuralNet::NeuralNet (size_t in_size, size_t hidden_size, size_t out_size, + float(*a)(float), float(*d)(float), float l, int e) { + epochs=e; + ref_epochs=epochs; + l_rate=l; + ex=0; /* no expected value yet: keeps expected() and save() from reading an indeterminate float */ + actv_f=a; + deriv=d; + + input = new Layer(in_size,a,d); + hidden = new Layer(hidden_size,a,d); + output = new Layer(out_size,a,d); + link(); +} + +/** + * @brief It gets the output of the network (note: the output layer should contain + * a single neuron; only neuron 0 is read) + */ +float NeuralNet::getOutput() { return (*output)[0].getActv(); } + +/** + * @brief It gets the output of the network in case the output layer contains more neurons + */ +vector<float> NeuralNet::getVectorOutput() { + vector<float> v; + + for (size_t i=0; i<output->size(); i++) + v.push_back( (*output)[i].getActv() ); + return v; +} + +/** + * @brief It gets the error made on the expected result as |v-v'|/v + * @param expected Expected value + * @return Mean error + */ +float NeuralNet::error(float expected) { + return fabs( (getOutput() - expected* + deriv(getOutput())) / (fabs(expected)) ); +} + +/** + * @brief It propagates values through the network. Use this when you want to give + * an already trained network some new values and get the output + */ +void NeuralNet::propagate() { + hidden->propagate(); + output->propagate(); +} + +/** + * @brief It sets the input for the network + * @param v Vector of floats, containing the values to give to your network + */ +void NeuralNet::setInput(vector<float>& v) { + input->setProp(v); + input->setActv(v); +} + +/** + * @brief It links the layers of the network (input, hidden, output). 
Don't use unless + * you exactly know what you're doing, it is already called by the constructor + */ +void NeuralNet::link() { + hidden->link(*input); + output->link(*hidden); +} + +/** + * @brief It sets the value you expect from your network + */ +void NeuralNet::setExpected(float e) { ex=e; } + +/** + * @brief It gets the value expected. Of course you should specify this when you + * build your network by using setExpected. + */ +float NeuralNet::expected() { return ex; } + +/** + * @brief It updates the deltas of the net's synapsis through back-propagation (output layer first, then hidden layer). + * In-class use only + */ +void NeuralNet::updateWeights() { + float out_delta = 0; + + for (size_t i=0; i<output->size(); i++) { + Neuron *n = &(*output)[i]; + + for (size_t j=0; j<n->nIn(); j++) { + Synapsis *s = &(n->synIn(j)); + out_delta = s->getIn()->getActv() * error(ex) * l_rate; + s->setDelta(out_delta); + } + } + + for (size_t i=0; i<hidden->size(); i++) { + Neuron *n = &(*hidden)[i]; + float d = deriv(n->getProp()) * n->synOut(0).getWeight() * out_delta; + + for (size_t j=0; j<n->nIn(); j++) { + Synapsis *s = &(n->synIn(j)); + s->setDelta(l_rate * d * s->getIn()->getActv()); + } + } +} + +/** + * @brief It commits the changes made by updateWeights() to the layer l. 
+ * In-class use only + * @param l Layer to commit the changes + */ +void NeuralNet::commitChanges (Layer *l) { + for (size_t i=0; i<l->size(); i++) { + Neuron *n = &(*l)[i]; + + for (size_t j=0; j<n->nIn(); j++) { + Synapsis *s = &(n->synIn(j)); + s->setWeight(s->getWeight() + s->getDelta()); + s->setDelta(0); + } + } +} + +/** + * @brief It updates through back-propagation the weights of the synapsis and + * computes again the output value ref_epochs times (the counter restarts on every call), calling back + * updateWeights and commitChanges functions + */ +void NeuralNet::update() { + for (epochs=ref_epochs; epochs>0; epochs--) { + updateWeights(); + commitChanges(output); + commitChanges(hidden); + propagate(); + } +} + +/** + * @brief Save an already trained neural network to a binary file + * @param fname Name of the file to write + * @return true in case of success, false if the file cannot be opened or the header cannot be written + */ +bool NeuralNet::save(const char *fname) { + FILE *fp; + struct netrecord record; + + if (!(fp=fopen(fname,"wb"))) + return false; + + record.input_size = input->size(); + record.hidden_size = hidden->size(); + record.output_size = output->size(); + + record.epochs = ref_epochs; + record.l_rate = l_rate; + record.ex = ex; + + if (fwrite (&record, sizeof(struct netrecord), 1, fp)!=1) { + fclose(fp); return false; } + + // Saving neurons' state + for (unsigned int i=0; i < input->size(); i++) { + struct neuronrecord r; + r.prop = (*input)[i].getProp(); + r.actv = (*input)[i].getActv(); + fwrite (&r, sizeof(struct neuronrecord), 1, fp); + } + + for (unsigned int i=0; i < hidden->size(); i++) { + struct neuronrecord r; + r.prop = (*hidden)[i].getProp(); + r.actv = (*hidden)[i].getActv(); + fwrite (&r, sizeof(struct neuronrecord), 1, fp); + } + + for (unsigned int i=0; i < output->size(); i++) { + struct neuronrecord r; + r.prop = (*output)[i].getProp(); + r.actv = (*output)[i].getActv(); + fwrite (&r, sizeof(struct neuronrecord), 1, fp); + } + + // Saving synapsis' state + for (unsigned int i=0; i < input->size(); i++) { + int nout = (*input)[i].nOut(); + fwrite 
(&nout, sizeof(int), 1, fp); + + for (int j=0; j < nout; j++) { + struct synrecord r; + r.w = (*input)[i].synOut(j).getWeight(); + r.d = (*input)[i].synOut(j).getDelta(); + fwrite (&r, sizeof(struct synrecord), 1, fp); + } + } + + for (unsigned int i=0; i < output->size(); i++) { + int nin = (*output)[i].nIn(); + fwrite (&nin, sizeof(int), 1, fp); + + for (int j=0; j < nin; j++) { + struct synrecord r; + r.w = (*output)[i].synIn(j).getWeight(); + r.d = (*output)[i].synIn(j).getDelta(); + fwrite (&r, sizeof(struct synrecord), 1, fp); + } + } + + for (unsigned int i=0; i < hidden->size(); i++) { + int nin = (*hidden)[i].nIn(); + fwrite (&nin, sizeof(int), 1, fp); + + for (int j=0; j < nin; j++) { + struct synrecord r; + r.w = (*hidden)[i].synIn(j).getWeight(); + r.d = (*hidden)[i].synIn(j).getDelta(); + fwrite (&r, sizeof(struct synrecord), 1, fp); + } + } + + for (unsigned int i=0; i < hidden->size(); i++) { + int nout = (*hidden)[i].nOut(); + fwrite (&nout, sizeof(int), 1, fp); + + for (int j=0; j < nout; j++) { + struct synrecord r; + r.w = (*hidden)[i].synOut(j).getWeight(); + r.d = (*hidden)[i].synOut(j).getDelta(); + fwrite (&r, sizeof(struct synrecord), 1, fp); + } + } + + fclose(fp); + return true; +} + +/** + * @brief Constructs a neural network from a previously saved file + * (saved using 'save()' method) + * @param fname File name to load the network from + * @throw NetworkFileNotFoundException NOTE(review): the empty throw() specification below forbids throwing; it has to be relaxed together with the declaration in neural++.h + */ +NeuralNet::NeuralNet (const char *fname) throw() { + struct netrecord record; + FILE *fp; + + if (!(fp=fopen(fname,"rb"))) + throw NetworkFileNotFoundException(); + + if (fread(&record, sizeof(struct netrecord), 1, fp)!=1) { + fclose(fp); throw NetworkFileNotFoundException(); } + + *this = NeuralNet(record.input_size, record.hidden_size, record.output_size, record.l_rate, record.epochs); + + // Restore neurons + for (unsigned int i=0; i < input->size(); i++) { + struct neuronrecord r; + fread (&r, sizeof(struct neuronrecord), 1, fp); + + (*input)[i].setProp(r.prop); + 
(*input)[i].setActv(r.actv); + (*input)[i].synClear(); + } + + for (unsigned int i=0; i < hidden->size(); i++) { + struct neuronrecord r; + fread (&r, sizeof(struct neuronrecord), 1, fp); + + (*hidden)[i].setProp(r.prop); + (*hidden)[i].setActv(r.actv); + (*hidden)[i].synClear(); + } + + for (unsigned int i=0; i < output->size(); i++) { + struct neuronrecord r; + fread (&r, sizeof(struct neuronrecord), 1, fp); + + (*output)[i].setProp(r.prop); + (*output)[i].setActv(r.actv); + (*output)[i].synClear(); + } + + for (unsigned int i=0; i < input->size(); i++) + (*input)[i].synClear(); + + for (unsigned int i=0; i < hidden->size(); i++) + (*hidden)[i].synClear(); + + for (unsigned int i=0; i < output->size(); i++) + (*output)[i].synClear(); + + hidden->link(*input); + output->link(*hidden); + + // Restore synapsis (counts come from the file, so they are clamped to the synapsis that really exist) + for (unsigned int i=0; i < input->size(); i++) { + int nout = 0; + fread (&nout, sizeof(int), 1, fp); + + for (int j=0; j < nout && (size_t) j < (*input)[i].nOut(); j++) { + struct synrecord r; + fread (&r, sizeof(struct synrecord), 1, fp); + + (*input)[i].synOut(j).setWeight(r.w); + (*input)[i].synOut(j).setDelta(r.d); + } + } + + for (unsigned int i=0; i < output->size(); i++) { + int nin = 0; + fread (&nin, sizeof(int), 1, fp); + + for (int j=0; j < nin && (size_t) j < (*output)[i].nIn(); j++) { + struct synrecord r; + fread (&r, sizeof(struct synrecord), 1, fp); + + (*output)[i].synIn(j).setWeight(r.w); + (*output)[i].synIn(j).setDelta(r.d); + } + } + + for (unsigned int i=0; i < hidden->size(); i++) { + int nin = 0; + fread (&nin, sizeof(int), 1, fp); + + for (int j=0; j < nin && (size_t) j < (*hidden)[i].nIn(); j++) { + struct synrecord r; + fread (&r, sizeof(struct synrecord), 1, fp); + + (*hidden)[i].synIn(j).setWeight(r.w); + (*hidden)[i].synIn(j).setDelta(r.d); + } + } + + for (unsigned int i=0; i < hidden->size(); i++) { + int nout = 0; + fread (&nout, sizeof(int), 1, fp); + + for (int j=0; j < nout && (size_t) j < (*hidden)[i].nOut(); j++) { + struct synrecord r; + fread (&r, sizeof(struct synrecord), 1, fp); + + (*hidden)[i].synOut(j).setWeight(r.w); + (*hidden)[i].synOut(j).setDelta(r.d); + } + } + 
+ fclose(fp); +} + +/** + * @brief Train a network using a training set loaded from an XML file. A sample XML file + * is available in examples/adder.xml + * @param xmlsrc XML file name, or the XML text itself, containing our training set + * @param src Source type from which the XML will be loaded (from a file [default] or from a string) + * @throw InvalidXMLException NOTE(review): the empty throw() specification below forbids throwing; relax it together with the declaration in neural++.h + */ +void NeuralNet::train (string xmlsrc, NeuralNet::source src = file) throw() { + float out; + CMarkup xml; + + if (src == file) + xml.Load(xmlsrc.c_str()); + else + xml.SetDoc(xmlsrc.c_str()); + + if (!xml.IsWellFormed()) { + throw InvalidXMLException(); + return; + } + + if (xml.FindElem("NETWORK")) { + while (xml.FindChildElem("TRAINING")) { + vector<float> input; + float output = 0; + bool valid = false; + + xml.IntoElem(); + + while (xml.FindChildElem("INPUT")) { + xml.IntoElem(); + input.push_back(atof(xml.GetData().c_str())); + xml.OutOfElem(); + } + + if (xml.FindChildElem("OUTPUT")) { + xml.IntoElem(); + output = atof(xml.GetData().c_str()); + xml.OutOfElem(); + } + + xml.OutOfElem(); + + while (!valid) { + char str[BUFSIZ]; + + setInput(input); + propagate(); + setExpected(output); + update(); + + out = getOutput(); + memset (str, 0x0, sizeof(str)); + snprintf (str, sizeof(str), "%f", out); + + if (!strstr(str, "inf")) + valid=true; + } + } + } + + return; +} + +/** + * @brief Initialize the training XML for the neural network (NOTE(review): the prolog strings were lost in this copy; only the root element is restored) + * @param xml String that will contain the XML + */ +void NeuralNet::initXML (string& xml) { + xml.append("\n" + "\n" + "\n\n" + "<NETWORK>\n" + ); +} + +/** + * @brief Splits a string into a vector of floats, given a delimiter + * @param delim Delimiter + * @param str String to be split + * @return Vector of floats containing the split values (characters beyond 1023 in a single token are ignored) + */ +vector<float> NeuralNet::split (char delim, string str) { + char tmp[1024]; + vector<float> v; + memset (tmp, 0x0, sizeof(tmp)); + + for (unsigned int i=0, j=0; i <= str.length(); i++) { + if (str[i] == delim || i == str.length()) { + v.push_back(atof(tmp)); + memset (tmp, 0x0, 
sizeof(tmp)); + j=0; + } else if (j < sizeof(tmp)-1) + tmp[j++] = str[i]; + } + + return v; +} + +/** + * @brief Get a training set from a string and copies it to an XML + * For example, these strings could be training sets for making sums: + * "2,3;5" - "5,6;11" - "2,2;4" - "4,5;9" + * This method called on the first string will return an XML such as this: + * '<TRAINING id="0"><INPUT id="0">2</INPUT><INPUT id="1">3</INPUT><OUTPUT id="0">5</OUTPUT> + * </TRAINING>' + * + * @param id ID for the given training set (0,1,..,n) + * @param set String containing input values and expected outputs + * @return XML string (empty when set contains no ';' separator) + */ +string NeuralNet::XMLFromSet (int id, string set) { + string xml; + vector<float> in, out; + unsigned int delimPos = -1; + char delim=';'; + char tmp[1024]; + + for (delimPos=0; delimPos < set.length() && set[delimPos] != delim; delimPos++); + + if (delimPos == set.length()) + return xml; + + string inStr = set.substr(0,delimPos); + string outStr = set.substr(delimPos+1, set.length()); + + in = split(',', inStr); + out = split(',', outStr); + + snprintf (tmp, sizeof(tmp), "%d", id); + xml += "\t<TRAINING id=\"" + string(tmp) + "\">\n"; + + for (unsigned int i=0; i < in.size(); i++) { + memset (tmp, 0x0, sizeof(tmp)); + snprintf (tmp, sizeof(tmp), "%d", i); + xml += "\t\t<INPUT id=\"" + string(tmp) + "\">"; + + memset (tmp, 0x0, sizeof(tmp)); + snprintf (tmp, sizeof(tmp), "%f", in[i]); + xml += string(tmp) + "</INPUT>\n"; + } + + for (unsigned int i=0; i < out.size(); i++) { + memset (tmp, 0x0, sizeof(tmp)); + snprintf (tmp, sizeof(tmp), "%d", i); + xml += "\t\t<OUTPUT id=\"" + string(tmp) + "\">"; + + memset (tmp, 0x0, sizeof(tmp)); + snprintf (tmp, sizeof(tmp), "%f", out[i]); + xml += string(tmp) + "</OUTPUT>\n"; + } + + xml += "\t</TRAINING>\n\n"; + return xml; +} + +/** + * @brief Closes an open XML document generated by "initXML" and "XMLFromSet" + * @param xml XML string to close + */ +void NeuralNet::closeXML(string &xml) { + xml.append("</NETWORK>\n\n"); +} + + diff --git a/src/neuron.cpp b/src/neuron.cpp new file mode 100644 index 0000000..037ca5f --- /dev/null +++ b/src/neuron.cpp @@ -0,0 +1,98 @@ 
+/************************************************************************************************** + * LibNeural++ v.0.2 - All-purpose library for managing neural networks * + * Copyright (C) 2009, BlackLight * + * * + * This program is free software: you can redistribute it and/or modify it under the terms of the * + * GNU General Public License as published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. This program is distributed in the hope * + * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * + * more details. You should have received a copy of the GNU General Public License along with * + * this program. If not, see <http://www.gnu.org/licenses/>. * + **************************************************************************************************/ + +#include "neural++.h" +using namespace neuralpp; + +/** + * @brief Constructor. Activation and propagation values start at 0 + * @param a Activation function + * @param d Its derivative + */ +Neuron::Neuron (float (*a)(float), float (*d)(float)) : actv_val(0), prop_val(0) { + actv_f=a; + deriv=d; +} + +/** + * @brief Alternative constructor, that also gets the synapsis linked to the neuron + */ +Neuron::Neuron (vector< Synapsis > i, vector< Synapsis > o, float (*a)(float), float(*d)(float)) : actv_val(0), prop_val(0) { + in=i; + out=o; + + actv_f=a; + deriv=d; +} + +/** + * @brief Gets the i-th synapsis connected on the input of the neuron + */ +Synapsis& Neuron::synIn (size_t i) { return in[i]; } + +/** + * @brief Gets the i-th synapsis connected on the output of the neuron + */ +Synapsis& Neuron::synOut (size_t i) { return out[i]; } + +/** + * @brief It pushes a copy of a new input synapsis + */ +void Neuron::push_in (Synapsis& s) { in.push_back(s); } + +/** + * @brief It pushes a copy of a new output synapsis + */ +void Neuron::push_out (Synapsis& s) { out.push_back(s); } + +/** + * @brief Change the propagation value of the neuron + */ +void 
Neuron::setProp (float val) { prop_val=val; } + +/** + * @brief Change the activation value of the neuron to actv_f(val) + */ +void Neuron::setActv (float val) { actv_val=actv_f(val); } + +/** + * @return Number of input synapsis + */ +size_t Neuron::nIn() { return in.size(); } + +/** + * @return Number of output synapsis + */ +size_t Neuron::nOut() { return out.size(); } + +/** + * @brief It gets the propagation value of the neuron + */ +float Neuron::getProp() { return prop_val; } + +/** + * @brief It gets the activation value of the neuron + */ +float Neuron::getActv() { return actv_val; } + +/** + * @brief Compute the value propagated to this neuron: the weighted sum of the activation values on its input synapsis + */ +float Neuron::propagate() { + float aux=0; + + for (size_t i=0; i<nIn(); i++) + aux += (in[i].getWeight() * in[i].getIn()->actv_val); + return aux; +} + + diff --git a/src/synapsis.cpp b/src/synapsis.cpp new file mode 100644 index 0000000..cefd1ce --- /dev/null +++ b/src/synapsis.cpp @@ -0,0 +1,84 @@ +/************************************************************************************************** + * LibNeural++ v.0.2 - All-purpose library for managing neural networks * + * Copyright (C) 2009, BlackLight * + * * + * This program is free software: you can redistribute it and/or modify it under the terms of the * + * GNU General Public License as published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. This program is distributed in the hope * + * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * + * more details. You should have received a copy of the GNU General Public License along with * + * this program. If not, see <http://www.gnu.org/licenses/>. 
* + **************************************************************************************************/ + +#include "neural++.h" +using namespace neuralpp; + +/** + * @brief Constructor. The weight is random; the generator is seeded only once, so synapsis built in the same second differ + * @param i Input neuron + * @param o Output neuron + * @param a Activation function + * @param d Derivative of the activation function + */ +Synapsis::Synapsis (Neuron* i, Neuron* o, float(*a)(float), float(*d)(float)) { + static bool seeded = false; + if (!seeded) { srand((unsigned) time(NULL)); seeded = true; } + delta=0; + weight=RAND; + in=i; + out=o; + + actv_f=a; + deriv=d; +} + +/** + * @brief Constructor + * @param i Input neuron + * @param o Output neuron + * @param w Weight for the synapsis + * @param a Activation function + * @param d Derivative of the activation function + */ +Synapsis::Synapsis (Neuron* i, Neuron* o, float w, float(*a)(float), float(*d)(float)) { + delta=0; + weight=w; + in=i; + out=o; + + actv_f=a; + deriv=d; +} + +/** + * @return Pointer to the input neuron of the synapsis + */ +Neuron* Synapsis::getIn() { return in; } + +/** + * @return Pointer to the output neuron of the synapsis + */ +Neuron* Synapsis::getOut() { return out; } + +/** + * @return Weight of the synapsis + */ +float Synapsis::getWeight() { return weight; } + +/** + * @return Delta of the synapsis + */ +float Synapsis::getDelta() { return delta; } + +/** + * @brief It sets the weight of the synapsis + */ +void Synapsis::setWeight(float w) { weight=w; } + +/** + * @brief It sets the delta (how much to change the weight after an update) + * of the synapsis + */ +void Synapsis::setDelta(float d) { delta=d; } + 