Home > Products > SophiaFramework UNIVERSE > Tutorial > Amazon E-Commerce > - 7 / 9 -

Parsing XML Data Output from Amazon ECS using DOM and SAX

When you click the "Go!" button in the Search Input Window or click on a product in the List of Product Window, a request is sent to ECS, and ECS returns data to the application as an XML-formatted data stream. The data returned by ECS depends on what has been input into the Amazon catalog. For instance, an XML document containing a list of items with limited information is returned for the request sent by clicking the "Go!" button in the Search Input Window, and an XML document containing detailed information about a product is returned for the request sent by clicking on a product in the List of Product Window.

The parsed information of each product will be stored in the AWSProduct data structure. The AWSListProducts data structure is for storing a list of products.

Parsing an XML document using the DOM parser is rather simple. This section mainly discusses how to implement the Amazon application effectively using the SAX parser.

Parsing XML Document using DOM parser

The source code below explains how to use the SFXXMLDOMParser class to parse an XML document. The DOM parser expands the XML document into memory as a tree. The application processes the nodes of the tree that it is interested in and calls the corresponding methods of AWSProduct class to get product information.

// Parses the XML file with the DOM parser and appends one product object
// per <Item> element to _productarray.
//
// filename    : path of the XML file to parse
// producttype : product type passed to AWSProduct::Factory for each item
//
// Returns the parser error if parsing fails, SFERR_FAILED if the document
// has no root, contains no <Item> element, or a product object cannot be
// created; otherwise the error of the last ParseItem/Append call.
// On the "no root"/"no item"/parse-failure paths _title is set to
// "Parsing failed".
SFCError AWSListProducts::GetXMLProducts(SFXPathConstRef filename, SFXAnsiString producttype)
{
  SFCError error;
  SInt32 size;
  SInt32 i;

  SFXXMLDOMParser parser;
  AWSProductPtr item;

  parser.SetDoNamespaces(false);

  // for debugging
  SFXHelper::dbgprintf("[SGXAWSParser] [%s]",
                       filename.AsFile().GetCString());

  // parse XML file using DOM parser
  // expand the XML document into memory as a tree
  error = parser.Parse(filename);
  SFXHelper::dbgprintf("[SGXAWSParser.Parse] [%d]", error);

  // if parsing failed
  if (error != SFERR_NO_ERROR)
  {
    _title = "Parsing failed";
    return error;
  }

  // get the document root
  SFXXMLDocumentPtr root = parser.GetDocument();

  // if no root found
  if (root == null)
  {
    error = SFERR_FAILED;
    _title = "Parsing failed";
    return error;
  }

  SFXXMLNodePtr _anode;

  // Get list of Items
  SFXList<SFXXMLNodePtr>* list = root->GetElementsByTagName("Item");

  // if no Item found
  if (list == null || list->GetSize() == 0)
  {
    error = SFERR_FAILED;
    _title = "Parsing failed";
    return error;
  }

  size = list->GetSize();

  for (i = 0; i < size; i++)
  { // process each item
    _anode = list->Get(i);

    // item now is the pointer to actual product class
    item = AWSProduct::Factory(producttype);

    // the factory result is dereferenced below, so stop if no product
    // object could be created
    if (item == null)
    {
      error = SFERR_FAILED;
      break;
    }

    item->SetParseImageUrl(true);
    item->SetParseCustomerReviews(true);
    item->SetParseSimilarProducts(true);

    // parse this node and store information in the item
    error = ParseItem(_anode, item);

    if (error == SFERR_NO_ERROR)
    {
      // put the item to the product list
      error = _productarray.Append(item);
      // if error then stop
      if (error != SFERR_NO_ERROR) break;
    }
    // NOTE(review): when ParseItem or Append fails, item is not stored in
    // _productarray and is not released here; confirm how objects returned
    // by AWSProduct::Factory must be released and free it on these paths.
  }

  return error;
}
// Walks the direct children of an <Item> node and stores the information
// found in the given product object.
//
// node : the <Item> node whose children are processed
// item : product object that receives the parsed values
//
// Returns SFERR_FAILED if node or item is null; otherwise SFERR_NO_ERROR or
// the first error reported by one of the item's Parse* methods (processing
// stops at the first error).
SFCError AWSListProducts::ParseItem(SFXXMLNodePtr node,
                                    AWSProductPtr item)
{
  SFCError error(SFERR_NO_ERROR);

  // both pointers are dereferenced below; reject null the same way
  // AWSProduct::ParseMediumImage does
  if (node == null || item == null) return SFERR_FAILED;

  SFXAnsiString label;       // node label
  SFXXMLNodePtr child(node); // begin from first child

  //
  // process this node to get information then store in the item
  //
  for (child = child->GetFirstChild();
       child != null && error == SFERR_NO_ERROR;
       child = child->GetNextSibling())
  {
    label = SFXAnsiString(child->GetNodeName());

    // get asin value
    if (label == "ASIN")
    {
      item->SetAsin(item->DecodeHtml(child->GetText()));
    }
    // get detail URL
    else if (label == "DetailPageURL")
    {
      item->SetDetailPageURL(item->DecodeHtml(child->GetText()));
    }
    // get sales rank (thousands separators are stripped before conversion)
    else if (label == "SalesRank")
    {
      item->SetSalesRank(
          child->GetText().Replace(',', "").AsSInt32());
    }
    // get item attributes (only the title when attribute parsing is off)
    else if (label == "ItemAttributes")
    {
      if (item->IsParseItemAttributes())
        error = item->ParseItemAttributes(child->GetFirstChild());
      else
        error = item->ParseTitle(child->GetFirstChild());
    }
    // get small image
    else if (label == "SmallImage")
    {
      if (item->IsParseImageUrl())
        error = item->ParseSmallImage(child->GetFirstChild());
    }
    // get medium image
    else if (label == "MediumImage")
    {
      if (item->IsParseImageUrl())
        error = item->ParseMediumImage(child->GetFirstChild());
    }
    // get large image
    else if (label == "LargeImage")
    {
      if (item->IsParseImageUrl())
        error = item->ParseLargeImage(child->GetFirstChild());
    }
    // get Offer Summary
    else if (label == "OfferSummary")
    {
      if (item->IsParseOfferSummary())
        error = item->ParseOfferSummary(child->GetFirstChild());
    }
    // get Offers
    else if (label == "Offers")
    {
      if (item->IsParseOffers())
        error = item->ParseOffers(child->GetFirstChild());
    }
    // get Customer Reviews
    else if (label == "CustomerReviews")
    {
      if (item->IsParseCustomerReviews())
        error = item->ParseCustomerReviews(child->GetFirstChild());
    }
    // get Similar Products
    else if (label == "SimilarProducts")
    {
      if (item->IsParseSimilarProducts())
        error = item->ParseSimilarProducts(child->GetFirstChild());
    }
    // get Browse Nodes
    else if (label == "BrowseNodes")
    {
      if (item->IsParseBrowserNodes())
        error = item->ParseBrowserNodes(child->GetFirstChild());
    }
    // get Listmania Lists
    else if (label == "ListmaniaLists")
    {
      if (item->IsParseListmaniaLists())
        error = item->ParseListmaniaLists(child->GetFirstChild());
    }
  }
  return error;
}

Below is the implementation of the ParseMediumImage method of AWSProduct class.

// Reads the medium image description (URL, Height, Width) from a sibling
// chain of nodes and stores it in _imageUrl.
//
// node : first node of the chain to scan
//
// Returns SFERR_FAILED when node is null, SFERR_NO_MEMORY when the
// ImageUrlRec cannot be allocated, SFERR_NO_ERROR otherwise.
SFCError AWSProduct::ParseMediumImage(SFXXMLNodePtr node)
{
 if (node == null) return SFERR_FAILED;

 // create the image record on first use
 if (_imageUrl == null)
 {
  _imageUrl = new ImageUrlRec;
  if (_imageUrl == null) return SFERR_NO_MEMORY;
 }

 while (node != null)
 {
  SFXAnsiString tag(node->GetNodeName());

  if (tag == "URL")
  {
   _imageUrl->mediumImageURL = DecodeHtml(node->GetText());
  }
  else if (tag == "Height")
  {
   _imageUrl->mediumImageHeight = node->GetText().AsSInt32();
  }
  else if (tag == "Width")
  {
   _imageUrl->mediumImageWidth = node->GetText().AsSInt32();
  }

  node = node->GetNextSibling();
 }

 return SFERR_NO_ERROR;
}

Parsing XML Document using SAX parser

The source code below explains how to use the SFXXMLSAXParser class to parse an XML document.

 // parse xml file using SAXParser
 AWSSAXParser     _parser(_searchindex);
 SFXXMLSAXParser  saxparser;
	
 // make the trace map to get list of products that contains only ASIN and Title
 _parser.SetRoot(NewTraceMap());

 // parse the xml file
 saxparser.SetDefaultHandler(&_parser);
 error = saxparser.Parse(filename);

 // display the list of products
 if ((::new ListProductsWindow(_parser.GetProducts(), _searchindex, SEARCHWINDOW)) == null)
  error = SFERR_NO_MEMORY;

The remainder of this section explains how to implement the Amazon application using the SAX parser.

The Amazon-SAX application tries to address the drawbacks of the SAX method while still keeping its runtime performance advantages. The keys are how to keep track of the location in the XML structure effectively during parsing, and how to coordinate the navigational aspect with the data collection aspect.

The first issue can be solved by using a stack data structure (SFXStack) to keep track of where you are during parsing. The second issue is addressed in the TagTracer class and the AWSSAXParser class.

Tag Tracer and Map of Tag Tracers

A tag tracer represents a position in a tag path from the root of an XML structure and is kept on the stack.

A tag tracer not only marks the position within the XML structure but also associates actions with that position.

Each tag tracer has one parent and zero-to-many children.

A map of tag tracers contains the paths from the root of the XML structure to the locations where we want to collect data. The tracer map represents the application's processing logic: when parsing an XML document, only paths that exist in this map are processed; all other paths are ignored. The tracer map is implemented using the SFXHashmap data structure.

When parsing the XML document, if a path from the root to the current tag name is found, the tag tracer corresponding to this tag name is placed on top of the stack. If the path is not found, the whole branch starting from this tag name is skipped.

The TagTracer class is shown below.

// A tag tracer represents one position in a tag path from the root of the
// XML structure. It holds the child tracers reachable from this position
// and the optional commands (methods of AWSProduct) associated with the
// start and the end of the tag.
class TagTracer {
SFMSEALCOPY(TagTracer)

private:
 // a hash map to keep child tag tracers, the parent tag tracer can trace to the child tag tracers in this map only
 SFXHashmap<SFXAnsiString, TagTracerPtr> _traceMap;
 Bool                _haveStartCommand; // have tag start command?
 Bool                _haveEndCommand;   // have tag end command?
 pt2AWSProductMember _startCommand;// command to be called at start tag
 pt2AWSProductMember _endCommand;  // command to be called at end tag

 // callback function type: (character data, command to run, caller object)
 typedef Void (*CallerProcedureSPP)(SFXAnsiString,  pt2AWSProductMember, VoidPtr);
 CallerProcedureSPP _spp;
public:
 // constructor
 TagTracer(Void);
 TagTracer(pt2AWSProductMember cm);
 TagTracer(pt2AWSProductMember startCm, pt2AWSProductMember endCm);
 // destructor
 virtual ~TagTracer(Void);

 // read-only accessors; const-qualified so they can be used on const tracers
 Bool HaveStartCommand(Void) const {return _haveStartCommand;}
 Bool HaveEndCommand(Void) const {return _haveEndCommand;}

 pt2AWSProductMember GetStartCommand(Void) const {return _startCommand;}
 pt2AWSProductMember GetEndCommand(Void) const {return _endCommand;}

 // this method is used to set up a map of tag tracers,
 // when this method is called, the caller tagtracer is registered to trace to the tracer with this tagname
 Void Trace(SFXAnsiStringConstRef tagname, TagTracerPtr tracer);

 // this method is called when the start of an element is found
 // if the tagname is found in the hashmap it means this tagname will be traced to
 // the found tagtracer for this tagname will be pushed to the stack
 // otherwise, the entire path will be skipped
 Void OnStart(SFXAnsiStringConstRef tagname, TracerTypeStackPtr stack, CallerProcedureSPP caller_procedure, VoidPtr caller);

 // this method is called when the end of an element is found
 // it gets the top tag tracer (pop), and does the associated command
 Void OnEnd (TracerTypeStackPtr stack, SFXAnsiString data, CallerProcedureSPP caller_procedure, VoidPtr caller);
};

TagTracerSkip class

The TagTracerSkip class inherits from TagTracer to represent a skipping position in a tag path. If the path from the root to the current tag name is not found, a skip object is placed on the stack to skip the whole branch rooted at this tag name.

The TagTracerSkip class is shown below.

// Tag tracer that marks a skipped position in a tag path.
// Its OnStart/OnEnd take only the tracer stack (no callback arguments).
class TagTracerSkip : public TagTracer
{
SFMSEALCOPY(TagTracerSkip)

public:
 // constructor
 TagTracerSkip(Void);
 // destructor
 virtual ~TagTracerSkip(Void);

 // called when the start of an element is found inside a skipped branch
 Void OnStart(SFXAnsiStringConstRef tagname, TracerTypeStackPtr stack);

 // called when the end of an element is found inside a skipped branch
 Void OnEnd (TracerTypeStackPtr stack);
};

AWSSAXParser class

This is the SAX parser handler class used for parsing with the SAX method. The class has a stack to keep track of the location within the XML structure. It delegates the event handling for the StartElement and EndElement events to the tag tracer class. It also calls the method of the AWSProduct class that corresponds to the action associated with each tag tracer.

// SAX event handler. It keeps a stack of tag tracers that records the
// current location within the XML structure and calls the AWSProduct
// method associated with a tag tracer when asked to via DoCommandEntry.
class AWSSAXParser : public SFXXMLDefaultHandler{
SFMSEALCOPY(AWSSAXParser)

private:
 TracerTypeStackPtr _tracerStack; // stack to keep the tag tracers
 SFXAnsiString      _data;        // character data

private:
 SFXAnsiString      _productgroup;// Book or DVD
 AWSProductPtr      _product;     // product that receives the commands in DoCommand
 AWSListProductsPtr _products;    // parsing result returned by GetProducts

 pt2AWSProductMember _pt2Member;  // pointer to a method of AWSProduct class
public:
 // constructor
 AWSSAXParser(SFXAnsiStringConstRef productgroup);
 // destructor
 virtual ~AWSSAXParser(Void);

 // push the root of the trace map into the stack
 // all future tag tracing will follow the paths from this root in the trace map
 Void SetRoot(TagTracerPtr root){_tracerStack->Push(root);}

 // return list of products (parsing result)
 AWSListProductsPtr GetProducts(Void){return _products;}

 // (callback function) entry point for DoCommand function;
 // reference is the AWSSAXParser object on which DoCommand is called
 static Void DoCommandEntry(SFXAnsiString param, pt2AWSProductMember method, VoidPtr reference);

 // call the corresponding method of AWSProduct class
 Void DoCommand(SFXAnsiString param, pt2AWSProductMember method);

public:
 virtual Void StartDocument(Void);
 virtual Void Characters(SFXAnsiStringConstRef string, BoolConst cdataSection = true);
 virtual Void StartElement(SFXAnsiStringConstRef uri, SFXAnsiStringConstRef localname, SFXAnsiStringConstRef qname, SFXXMLGrammar::XMLAttrListConstRef attrList);
 virtual Void EndElement(SFXAnsiStringConstRef uri = "", SFXAnsiStringConstRef localname = "", SFXAnsiStringConstRef qname = "");
 ...
};

How to collect data

As an example, we work with an XML document obtained from ECS with Operation = ItemSearch, ResponseGroup=Small, SearchIndex= Books, and Keywords= XML.

<ItemSearchResponse>
  ...
 <Items>
  <Item>
   ...
   <ASIN>0321430840</ASIN>
   <DetailPageURL>
http://www.amazon.com/gp/redirect.html%3FASIN=0321430840%26tag=ws%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0321430840%253FSubscriptionId=1NX4MQFASHJ0QAAVD182
   </DetailPageURL>
   <ItemAttributes>
    <Author>Elizabeth Castro</Author>
    <Manufacturer>Peachpit Press</Manufacturer>
    <ProductGroup>Book</ProductGroup>
    <Title>
    HTML, XHTML, and CSS, Sixth Edition (Visual Quickstart Guide)
    </Title>
   </ItemAttributes>
  </Item>
  ...
 </Items>
</ItemSearchResponse>

The table below shows the parse event sequence and the method of AWSProduct class to be called during parsing. In this example we collect ASIN data and Title data only.

Tag name

Parse Event (AWSSAXParser)

Method of AWSProduct to be called

 

StartDocument()

 

<ItemSearchResponse>

StartElement("ItemSearchResponse")

 

<Items>

StartElement("Items")

 

<Item>

StartElement("Item")

 

<ASIN>

StartElement("ASIN")

 

</ASIN>

EndElement("ASIN")

AWSProduct::SetAsin

<ItemAttributes>

StartElement("ItemAttributes")

 

<Title>

StartElement("Title")

 

</Title>

EndElement("Title")

AWSProduct::SetTitle

</ItemAttributes>

EndElement("ItemAttributes")

 

</Item>

EndElement("Item")

 

The AWSProduct::SetAsin method is the action associated with the tag tracer for the tag name <ASIN>, and the EndElement("ASIN") event triggers the call to it.

The ASIN tag tracer is created when the tracer map is built, and it is pushed onto the stack when the tag name <ASIN> is found while parsing the XML document.

// Tag tracer for the <ASIN> tag, created with AWSProduct::SetAsin as its associated command.
TagTracerPtr ASIN = new TagTracer(&AWSProduct::SetAsin);

The event EndElement("ASIN") of AWSSAXParser class delegates the event handling to the TagTracer::OnEnd method. OnEnd method pops the ASIN tag tracer from the stack, gets the associated action and notifies AWSSAXParser class to do the action.

// Called when the end of an element is found.
// Pops the top tag tracer from the stack and, if it has an end command,
// asks the caller (through caller_procedure) to execute it.
//
// stack            : stack of tag tracers
// data             : character data passed on to the command
// caller_procedure : callback used to notify the caller
// caller           : opaque pointer handed back to caller_procedure
//
// Does nothing when the stack pointer, the popped tracer, or the callback
// is null.
Void TagTracer::OnEnd (TracerTypeStackPtr stack, SFXAnsiString data, CallerProcedureSPP caller_procedure, VoidPtr caller)
{
 if (stack == null) return;

 // pop the tracer from the stack and do the associated command
 TagTracerPtr tracer = stack->Pop();
 if (tracer == null) return;

 if (tracer->HaveEndCommand() && caller_procedure != null)
 {
  // notify AWSSAXParser
  _spp = caller_procedure;
  _spp(data, tracer->GetEndCommand(), caller);
 }
}

The AWSSAXParser class calls the corresponding method of the AWSProduct class, which simply sets a class data member to the parameter. The function pointer technique is used, as shown below.

// Static callback entry point: converts the opaque reference back to the
// AWSSAXParser object and forwards the call to its DoCommand method.
Void AWSSAXParser::DoCommandEntry(SFXAnsiString param, pt2AWSProductMember method, VoidPtr reference)
{
 AWSSAXParserPtr self = static_cast<AWSSAXParserPtr>(reference);

 self->DoCommand(param, method);
}

// Calls the given method of AWSProduct on the current product.
//
// param  : value passed to the method
// method : pointer to the AWSProduct member function to call
//
// Does nothing when there is no current product.
Void AWSSAXParser::DoCommand(SFXAnsiString param, pt2AWSProductMember method)
{
 // set the method to call
 _pt2Member = method;

 if (_product == null) return;

 // call the corresponding method of AWSProduct;
 // the parentheses are required because the function call operator binds
 // tighter than ->*
 (_product->*_pt2Member)(param);
}

Create the map of tracers

The following tracer map is created for retrieving ASIN and Title.

 // Step 1: create one tracer per position of the map.
 // Only ASIN and Title carry a command (a method of AWSProduct).
 TagTracerPtr root               = new TagTracer();
 TagTracerPtr ItemSearchResponse = new TagTracer();
 TagTracerPtr Items              = new TagTracer();
 TagTracerPtr Item               = new TagTracer();
 TagTracerPtr ASIN               = new TagTracer(&AWSProduct::SetAsin);
 TagTracerPtr ItemAttributes     = new TagTracer();
 TagTracerPtr Title              = new TagTracer(&AWSProduct::SetTitle);

 // Step 2: connect the tracers to form the paths.

 // root\ItemSearchResponse
 root->Trace("ItemSearchResponse", ItemSearchResponse);

 // root\ItemSearchResponse\Items
 ItemSearchResponse->Trace("Items", Items);

 // root\ItemSearchResponse\Items\Item
 Items->Trace("Item", Item);

 // root\ItemSearchResponse\Items\Item\ASIN
 Item->Trace("ASIN", ASIN);

 // root\ItemSearchResponse\Items\Item\ItemAttributes
 Item->Trace("ItemAttributes", ItemAttributes);

 // root\ItemSearchResponse\Items\Item\ItemAttributes\Title
 ItemAttributes->Trace("Title", Title);

To use the tracer map, push its root onto the stack of the AWSSAXParser class by using the SetRoot method.

Go back  1   2   3   4   5   6   7   8   9  Next page