VBS HTML Parser


Topics:

Overview
Enumerations
Data Structures
Functions
HTML Tag Handlers


Overview

The vbsHTML class is a base class used to parse HTML documents. It is supplied with the database library to extract data from Web pages. The vbsHTML class works through multiple inheritance. It includes functions to load and parse HTML files. HTML tags are handled through the use of virtual tag handlers. Derived classes are responsible for processing HTML tags and any associated attributes by overriding the appropriate tag handler.


Enumerations

// The following list of HTML tags is a combination of HTML
// 2.0, 3.0, 3.2 tags supported by Netscape's Navigator
// web browser, Microsoft's Internet Explorer web browser, 
// and standards defined by the World Wide Web Consortium.
// This list was taken from Willcam's Comprehensive HTML 
// Cross Reference at: 
// http://www.willcam.com/cmat/html/crossref.html 
enum { // HTML tags and modifiers ID enumeration
  vbsHTML::vbs_invalid_tag = 0, // Invalid tag specified
  vbsHTML::vbs_unknown_tag,     // Unknown tag specified
  vbsHTML::vbs_special_tag,     // Unknown special tags starting with 
                                // an ampersand ending in a semicolon
                                // &xxxx;

    // Tags and format specifiers with special meaning
  vbsHTML::vbs_comment_tag,     // comment
  vbsHTML::vbs_less_then,       // Less than sign "&lt;"
  vbsHTML::vbs_greater_then,    // Greater than sign "&gt;"
  vbsHTML::vbs_ampersand,       // Ampersand "&amp;"
  vbsHTML::vbs_nb_space,        // Non-breaking space "&nbsp;"
  vbsHTML::vbs_quote,           // Quotation mark "&quot;"
  vbsHTML::vbs_ex_acsii_set,    // Extended ASCII character set "&#"

    // HTML tag codes
  vbsHTML::vbs_a_tag,           // anchor
  vbsHTML::vbs_abbrev_tag,      // abbreviation
  vbsHTML::vbs_acronym_tag,     // acronym
  vbsHTML::vbs_address_tag,     // address
  vbsHTML::vbs_applet_tag,      // java applet
  vbsHTML::vbs_area_tag,        // area
  vbsHTML::vbs_au_tag,          // author
  vbsHTML::vbs_author_tag,      // author
  vbsHTML::vbs_b_tag,           // bold
  vbsHTML::vbs_banner_tag,      // banner
  vbsHTML::vbs_base_tag,        // base
  vbsHTML::vbs_basefont_tag,    // base font
  vbsHTML::vbs_bgsound_tag,     // background sound
  vbsHTML::vbs_big_tag,         // big text
  vbsHTML::vbs_blink_tag,       // blink
  vbsHTML::vbs_blockquote_tag,  // block quote
  vbsHTML::vbs_bq_tag,          // block quote
  vbsHTML::vbs_body_tag,        // body
  vbsHTML::vbs_br_tag,          // line break
  vbsHTML::vbs_caption_tag,     // caption
  vbsHTML::vbs_center_tag,      // center
  vbsHTML::vbs_cite_tag,        // citation
  vbsHTML::vbs_code_tag,        // code
  vbsHTML::vbs_col_tag,         // table column
  vbsHTML::vbs_colgroup_tag,    // table column group
  vbsHTML::vbs_credit_tag,      // credit
  vbsHTML::vbs_del_tag,         // deleted text
  vbsHTML::vbs_dfn_tag,         // definition
  vbsHTML::vbs_dir_tag,         // directory list
  vbsHTML::vbs_div_tag,         // division
  vbsHTML::vbs_dl_tag,          // definition list
  vbsHTML::vbs_dt_tag,          // definition term
  vbsHTML::vbs_dd_tag,          // definition definition
  vbsHTML::vbs_em_tag,          // emphasized
  vbsHTML::vbs_embed_tag,       // embed
  vbsHTML::vbs_fig_tag,         // figure
  vbsHTML::vbs_fn_tag,          // footnote
  vbsHTML::vbs_font_tag,        // font
  vbsHTML::vbs_form_tag,        // form
  vbsHTML::vbs_frame_tag,       // frame
  vbsHTML::vbs_frameset_tag,    // frame set
  vbsHTML::vbs_h1_tag,          // heading 1
  vbsHTML::vbs_h2_tag,          // heading 2
  vbsHTML::vbs_h3_tag,          // heading 3
  vbsHTML::vbs_h4_tag,          // heading 4
  vbsHTML::vbs_h5_tag,          // heading 5
  vbsHTML::vbs_h6_tag,          // heading 6
  vbsHTML::vbs_head_tag,        // head
  vbsHTML::vbs_hr_tag,          // horizontal rule
  vbsHTML::vbs_html_tag,        // html
  vbsHTML::vbs_i_tag,           // italic
  vbsHTML::vbs_iframe_tag,      // frame - floating
  vbsHTML::vbs_img_tag,         // inline image
  vbsHTML::vbs_input_tag,       // form input
  vbsHTML::vbs_ins_tag,         // inserted text
  vbsHTML::vbs_isindex_tag,     // is index
  vbsHTML::vbs_kbd_tag,         // keyboard
  vbsHTML::vbs_lang_tag,        // language
  vbsHTML::vbs_lh_tag,          // list heading
  vbsHTML::vbs_li_tag,          // list item
  vbsHTML::vbs_link_tag,        // link
  vbsHTML::vbs_listing_tag,     // listing
  vbsHTML::vbs_map_tag,         // map
  vbsHTML::vbs_marquee_tag,     // marquee
  vbsHTML::vbs_math_tag,        // math
  vbsHTML::vbs_menu_tag,        // menu list
  vbsHTML::vbs_meta_tag,        // meta
  vbsHTML::vbs_multicol_tag,    // multi column text
  vbsHTML::vbs_nobr_tag,        // no break
  vbsHTML::vbs_noframes_tag,    // no frames
  vbsHTML::vbs_note_tag,        // note
  vbsHTML::vbs_ol_tag,          // ordered list
  vbsHTML::vbs_overlay_tag,     // overlay
  vbsHTML::vbs_p_tag,           // paragraph
  vbsHTML::vbs_param_tag,       // parameters
  vbsHTML::vbs_person_tag,      // person
  vbsHTML::vbs_plaintext_tag,   // plain text
  vbsHTML::vbs_pre_tag,         // preformatted text
  vbsHTML::vbs_q_tag,           // quote
  vbsHTML::vbs_range_tag,       // range
  vbsHTML::vbs_samp_tag,        // sample
  vbsHTML::vbs_script_tag,      // script
  vbsHTML::vbs_select_tag,      // form select
  vbsHTML::vbs_small_tag,       // small text
  vbsHTML::vbs_spacer_tag,      // white space
  vbsHTML::vbs_spot_tag,        // spot
  vbsHTML::vbs_strike_tag,      // strikethrough
  vbsHTML::vbs_strong_tag,      // strong
  vbsHTML::vbs_sub_tag,         // subscript
  vbsHTML::vbs_sup_tag,         // superscript
  vbsHTML::vbs_tab_tag,         // horizontal tab
  vbsHTML::vbs_table_tag,       // table
  vbsHTML::vbs_tbody_tag,       // table body
  vbsHTML::vbs_td_tag,          // table data
  vbsHTML::vbs_textarea_tag,    // form text area
  vbsHTML::vbs_textflow_tag,    // java applet textflow
  vbsHTML::vbs_tfoot_tag,       // table footer
  vbsHTML::vbs_th_tag,          // table header
  vbsHTML::vbs_thead_tag,       // table head
  vbsHTML::vbs_title_tag,       // title
  vbsHTML::vbs_tr_tag,          // table row
  vbsHTML::vbs_tt_tag,          // teletype
  vbsHTML::vbs_u_tag,           // underlined
  vbsHTML::vbs_ul_tag,          // unordered list
  vbsHTML::vbs_var_tag,         // variable
  vbsHTML::vbs_wbr_tag,         // word break
  vbsHTML::vbs_xmp_tag          // example
};


Data Structures

Data structure used to store the file position of an HTML tag, the tag itself, its attributes and instructions.

struct vbsHTMLTagInfo
{
  // File information
  df_StreamPos start_tag; // This tag's starting position in the file
  df_StreamPos end_tag;   // This tag's ending position in the file
  unsigned tag_length;    // The complete length of this tag "< ---- >"
  
  // Tag information
  int tag_id;        // Numerical value used to identify supported tags
  vbString tag_info; // Complete tag from opening to closing bracket
  vbString tag;      // HTML tag
  vbString attr;     // HTML tag attributes 

  // Tag instructions
  int start_instruction; // True if start of tag instruction "<"
  int end_instruction;   // True if end of a tag instruction "/x>"
  int has_attributes;    // True if this tag has associated attributes
}; 


Functions

vbsHTML::vbsHTML()
vbsHTML::~vbsHTML()
vbsHTML::ClearTagList()
vbsHTML::CloseFile()
vbsHTML::CollectHTMLTags()
vbsHTML::Copy()
vbsHTML::GetTag()
vbsHTML::GetTagID()
vbsHTML::GetTagList()
vbsHTML::HandleHTMLTag()
vbsHTML::LoadHTMLFile()
vbsHTML::LoadMemoryBuffer()
vbsHTML::NumProcessed()
vbsHTML::NumTags()
vbsHTML::ParseHTMLTagInfo()
vbsHTML::ProcessHTMLTags()

vbsHTML::vbsHTML() - Default class constructor.

vbsHTML::vbsHTML(const vbsHTML &ob) - Class copy constructor.

virtual vbsHTML::~vbsHTML() - Class destructor.

void vbsHTML::ClearTagList() - Public member function used to clear the tag list.

void vbsHTML::CloseFile() - Public member function used to close the open HTML file after a load operation.

int vbsHTML::CollectHTMLTags() - Internal processing function used to collect all the HTML tags in a previously opened file. Returns zero if no file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int vbsHTML::CollectHTMLTags(const MemoryBuffer &membuf) - Internal processing function used to collect all the HTML tags from a previously loaded MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.

void vbsHTML::Copy(const vbsHTML &ob) - Internal processing function used to copy vbsHTML objects.

char *vbsHTML::GetTag(int tag_id) - Public member function that returns a null-terminated string based on the value of the tag ID number. The "tag_id" variable must equal one of the integer constants defined in the tag ID enumeration.

int vbsHTML::GetTagID(const vbString &tag) - Public member function that returns a numerical value defined in the tag ID enumeration that represents the specified tag.

vbDLList *vbsHTML::GetTagList() - Public member function that returns a pointer to the tag list.

void vbsHTML::HandleHTMLTag(int tag_id) - Internal processing function used to execute the derived class version of a specific tag handler.

int vbsHTML::LoadHTMLFile(const char *fname) - Public member function used to open the specified HTML file and process all the tags collected from the file. Returns zero if no disk file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int vbsHTML::LoadMemoryBuffer(const MemoryBuffer &membuf) - Public member function used to process all the tags stored in a MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.

unsigned vbsHTML::NumProcessed() - Public member function used to retrieve the total number of tags processed.

unsigned vbsHTML::NumTags() - Public member function used to retrieve the total number of tags collected.

void vbsHTML::ParseHTMLTagInfo(vbsHTMLTagInfo &t) - Public member function used to parse the specific tag information based on the string contained in the vbsHTMLTagInfo::tag_info member.

int vbsHTML::ProcessHTMLTags() - Internal processing function used to read and process all the tags in a previously opened file. Returns zero if no disk file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int vbsHTML::ProcessHTMLTags(const MemoryBuffer &membuf) - Internal processing function used to read and process all the tags in a MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.


HTML Tag Handlers

Derived class interface used to process tags.

void vbsHTML::Handle_INVALID_Tag()
{
  // Override to handle INVALID tags
}

void vbsHTML::Handle_UNKNOWN_Tag()
{
  // Override to handle UNKNOWN tags
}

void vbsHTML::Handle_UNKNOWN_SPECIAL_Tag()
{
  // Override to handle unknown special tags starting 
  // with an ampersand ending in a semicolon &xxxx;
}

void vbsHTML::Handle_COMMENT_Tag()
{
  // Override to handle COMMENT tags
}

void vbsHTML::Handle_LESS_THEN_Tag()
{
  // Override to handle a less than sign "&lt;" 
}

void vbsHTML::Handle_GREATER_THEN_Tag() 
{
  // Override to handle a greater than sign "&gt;"
}

void vbsHTML::Handle_AMPERSAND_Tag()
{
  // Override to handle an ampersand "&amp;" 
}

void vbsHTML::Handle_NB_SPACE_Tag()
{
  // Override to handle a non-breaking space "&nbsp;"
}

void vbsHTML::Handle_QUOTE_Tag() 
{
  // Override to handle a quotation mark "&quot;" 
}

void vbsHTML::Handle_EX_ASCII_Tag()
{
  // Override to handle the extended ASCII character set "&#"
}

void vbsHTML::Handle_A_Tag()
{
  // Override to handle ANCHOR tags
}

void vbsHTML::Handle_ABBREV_Tag()
{
  // Override to handle ABBREVIATION tags
}

void vbsHTML::Handle_ACRONYM_Tag()
{
  // Override to handle ACRONYM tags
}

void vbsHTML::Handle_ADDRESS_Tag()
{
  // Override to handle ADDRESS tags
}

void vbsHTML::Handle_APPLET_Tag()
{
  // Override to handle JAVA APPLET tag
}

void vbsHTML::Handle_AREA_Tag()
{
  // Override to handle AREA tags
}

void vbsHTML::Handle_AU_Tag()
{
  // Override to handle AUTHOR tags
}

void vbsHTML::Handle_AUTHOR_Tag()
{
  // Override to handle AUTHOR tags
}

void vbsHTML::Handle_B_Tag()
{
  // Override to handle BOLD tags
}

void vbsHTML::Handle_BANNER_Tag()
{
  // Override to handle BANNER tags
}

void vbsHTML::Handle_BASE_Tag()
{
  // Override to handle BASE tags
}

void vbsHTML::Handle_BASEFONT_Tag()
{
  // Override to handle BASE FONT
}

void vbsHTML::Handle_BGSOUND_Tag()
{
  // Override to handle BACKGROUND SOUND
}

void vbsHTML::Handle_BIG_Tag()
{
  // Override to handle BIG text
}

void vbsHTML::Handle_BLINK_Tag()
{
  // Override to handle BLINK tags
}

void vbsHTML::Handle_BLOCKQUOTE_Tag()
{
  // Override to handle BLOCK QUOTE tags
}

void vbsHTML::Handle_BQ_Tag()
{
  // Override to handle BLOCK QUOTE tags
}

void vbsHTML::Handle_BODY_Tag()
{
  // Override to handle BODY tags
}

void vbsHTML::Handle_BR_Tag()
{
  // Override to handle LINE BREAK tags
}

void vbsHTML::Handle_CAPTION_Tag()
{
  // Override to handle CAPTION tags
}

void vbsHTML::Handle_CENTER_Tag()
{
  // Override to handle CENTER tags
}

void vbsHTML::Handle_CITE_Tag()
{
  // Override to handle CITATION tags
}

void vbsHTML::Handle_CODE_Tag()
{
  // Override to handle CODE tags
}

void vbsHTML::Handle_COL_Tag()
{
  // Override to handle TABLE Cols tags
}

void vbsHTML::Handle_COLGROUP_Tag()
{
  // Override to handle TABLE column group tags
}

void vbsHTML::Handle_CREDIT_Tag()
{
  // Override to handle CREDIT tags
}

void vbsHTML::Handle_DEL_Tag()
{
  // Override to handle DELETED text tags
}

void vbsHTML::Handle_DFN_Tag()
{
  // Override to handle DEFINITION tags
}

void vbsHTML::Handle_DIR_Tag()
{
  // Override to handle DIRECTORY list tags
}

void vbsHTML::Handle_DIV_Tag()
{
  // Override to handle DIVISION tags
}

void vbsHTML::Handle_DL_Tag()
{
  // Override to handle DEFINITION list tags
}

void vbsHTML::Handle_DT_Tag()
{
  // Override to handle DEFINITION term tags
}

void vbsHTML::Handle_DD_Tag()
{
  // Override to handle DEFINITION list definition (DD) tags
}

void vbsHTML::Handle_EM_Tag()
{
  // Override to handle EMPHASIZED tags
}

void vbsHTML::Handle_EMBED_Tag()
{
  // Override to handle EMBED tags
}

void vbsHTML::Handle_FIG_Tag()
{
  // Override to handle FIGURE tags
}

void vbsHTML::Handle_FN_Tag()
{
  // Override to handle FOOTNOTE tags
}

void vbsHTML::Handle_FONT_Tag()
{
  // Override to handle FONT tags
}

void vbsHTML::Handle_FORM_Tag()
{
  // Override to handle FORM tags
}

void vbsHTML::Handle_FRAME_Tag()
{
  // Override to handle FRAME tags
}

void vbsHTML::Handle_FRAMESET_Tag()
{
  // Override to handle FRAME sets
}

void vbsHTML::Handle_H1_Tag()
{
  // Override to handle HEADING 1 tags
}

void vbsHTML::Handle_H2_Tag()
{
  // Override to handle HEADING 2 tags
}

void vbsHTML::Handle_H3_Tag()
{
  // Override to handle HEADING 3 tags
}

void vbsHTML::Handle_H4_Tag()
{
  // Override to handle HEADING 4 tags
}

void vbsHTML::Handle_H5_Tag()
{
  // Override to handle HEADING 5 tags
}

void vbsHTML::Handle_H6_Tag()
{
  // Override to handle HEADING 6 tags
}

void vbsHTML::Handle_HEAD_Tag()
{
  // Override to handle HEAD tags
}

void vbsHTML::Handle_HR_Tag()
{
  // Override to handle HORIZONTAL rules
}

void vbsHTML::Handle_HTML_Tag()
{
  // Override to handle HTML tags
}

void vbsHTML::Handle_I_Tag()
{
  // Override to handle ITALIC tags
}

void vbsHTML::Handle_IFRAME_Tag()
{
  // Override to handle FRAME - Floating tag
}

void vbsHTML::Handle_IMG_Tag()
{
  // Override to handle INLINE images
}

void vbsHTML::Handle_INPUT_Tag()
{
  // Override to handle FORM input tags
}

void vbsHTML::Handle_INS_Tag()
{
  // Override to handle INSERTED text
}

void vbsHTML::Handle_ISINDEX_Tag()
{
  // Override to handle ISINDEX tag
}

void vbsHTML::Handle_KBD_Tag()
{
  // Override to handle KEYBOARD tags
}

void vbsHTML::Handle_LANG_Tag()
{
  // Override to handle LANGUAGE tags
}

void vbsHTML::Handle_LH_Tag()
{
  // Override to handle LIST header tags
}

void vbsHTML::Handle_LI_Tag()
{
  // Override to handle LIST item tags
}

void vbsHTML::Handle_LINK_Tag()
{
  // Override to handle LINK tags
}

void vbsHTML::Handle_LISTING_Tag()
{
  // Override to handle LISTING tags
}

void vbsHTML::Handle_MAP_Tag()
{
  // Override to handle MAP tags
}

void vbsHTML::Handle_MARQUEE_Tag()
{
  // Override to handle MARQUEE tags
}

void vbsHTML::Handle_MATH_Tag()
{
  // Override to handle MATH tags
}

void vbsHTML::Handle_MENU_Tag()
{
  // Override to handle MENU list tags
}

void vbsHTML::Handle_META_Tag()
{
  // Override to handle META tags
}

void vbsHTML::Handle_MULTICOL_Tag()
{
  // Override to handle MULTI COLUMN tags
}

void vbsHTML::Handle_NOBR_Tag()
{
  // Override to handle NO BREAK tags
}

void vbsHTML::Handle_NOFRAMES_Tag()
{
  // Override to handle NO FRAMES tags
}

void vbsHTML::Handle_NOTE_Tag()
{
  // Override to handle NOTE tags
}

void vbsHTML::Handle_OL_Tag()
{
  // Override to handle ORDERED list tags
}

void vbsHTML::Handle_OVERLAY_Tag()
{
  // Override to handle OVERLAY tags
}

void vbsHTML::Handle_P_Tag()
{
  // Override to handle PARAGRAPH tags
}

void vbsHTML::Handle_PARAM_Tag()
{
  // Override to handle PARAMETERS tags
}

void vbsHTML::Handle_PERSON_Tag()
{
  // Override to handle PERSON tags
}

void vbsHTML::Handle_PLAINTEXT_Tag()
{
  // Override to handle PLAIN text tags
}

void vbsHTML::Handle_PRE_Tag()
{
  // Override to handle PREFORMATTED text tags
}

void vbsHTML::Handle_Q_Tag()
{
  // Override to handle QUOTE tags
}

void vbsHTML::Handle_RANGE_Tag()
{
  // Override to handle RANGE tags
}

void vbsHTML::Handle_SAMP_Tag()
{
  // Override to handle SAMPLE tags
}

void vbsHTML::Handle_SCRIPT_Tag()
{
  // Override to handle SCRIPT tags
}

void vbsHTML::Handle_SELECT_Tag()
{
  // Override to handle FORM SELECT tags
}

void vbsHTML::Handle_SMALL_Tag()
{
  // Override to handle SMALL text tags
}

void vbsHTML::Handle_SPACER_Tag()
{
  // Override to handle WHITE SPACE tags
}

void vbsHTML::Handle_SPOT_Tag()
{
  // Override to handle SPOT tags
}

void vbsHTML::Handle_STRIKE_Tag()
{
  // Override to handle STRIKETHROUGH tags
}

void vbsHTML::Handle_STRONG_Tag()
{
  // Override to handle STRONG tags
}

void vbsHTML::Handle_SUB_Tag()
{
  // Override to handle SUBSCRIPT tags
}

void vbsHTML::Handle_SUP_Tag()
{
  // Override to handle SUPERSCRIPT tags
}

void vbsHTML::Handle_TAB_Tag()
{
  // Override to handle HORIZONTAL TABS tags
}

void vbsHTML::Handle_TABLE_Tag()
{
  // Override to handle TABLE tags
}

void vbsHTML::Handle_TBODY_Tag()
{
  // Override to handle TABLE body tags
}

void vbsHTML::Handle_TD_Tag()
{
  // Override to handle TABLE data tags
}

void vbsHTML::Handle_TEXTAREA_Tag()
{
  // Override to handle FORM text area tags
}

void vbsHTML::Handle_TEXTFLOW_Tag()
{
  // Override to handle JAVA applet textflow
}

void vbsHTML::Handle_TFOOT_Tag()
{
  // Override to handle TABLE footer tags
}

void vbsHTML::Handle_TH_Tag()
{
  // Override to handle TABLE header tags
}

void vbsHTML::Handle_THEAD_Tag()
{
  // Override to handle TABLE head tag
}

void vbsHTML::Handle_TITLE_Tag()
{
  // Override to handle TITLE tags
}

void vbsHTML::Handle_TR_Tag()
{
  // Override to handle TABLE row tags 
}

void vbsHTML::Handle_TT_Tag()
{
  // Override to handle TELETYPE tags
}

void vbsHTML::Handle_U_Tag()
{
  // Override to handle UNDERLINED tags
}

void vbsHTML::Handle_UL_Tag()
{
  // Override to handle UNORDERED list tags
}

void vbsHTML::Handle_VAR_Tag()
{
  // Override to handle VARIABLE tags 
}

void vbsHTML::Handle_WBR_Tag()
{
  // Override to handle WORD BREAK tags
}

void vbsHTML::Handle_XMP_Tag()
{
  // Override to handle EXAMPLE tags
}


End Of Document