Don't think you can't: February 2011

Sunday, February 27, 2011

+ C++ Red Black Tree implementation

RedBlackTree.h
------------------------------------------------------------------------------------------------
#include <stdio.h>
#include <string.h>
#include <vector>

using namespace std;

/*
    Each child of the Red Black Tree
*/
class Node
{
public:
    char *data; /*data*/
    char color;    /*color*/
    Node *left, *right, *parent; /*lelf, right, parent*/

    /*copy constructor*/
    Node(char *data)
    {
        this->data = new char[strlen(data) + 1];
        strcpy(this->data, data);
        color = NULL;
    }

    /*constructor*/
    Node()
    {
        this->data = new char[1];
        color = NULL;
    }

    /*destructor*/
    ~Node()
    {
        delete (this->data);
        color = NULL;
    }
};

/*
    This is a class of Red Black Tre
*/
class RedBlackTree
{
private:
    Node    *Root;    /*root*/
    Node    *Nil;    /*leaf*/

public:
    /*constructor*/
    RedBlackTree            ();

    /*destructor*/
    ~RedBlackTree            ();

    Node*    tree_successor    ( Node* x );

            /*left rotate*/
    int        leftRotate        ( Node *x );

            /*right rotate*/
    int        rightRotate        ( Node *x );

            /*insert key*/
    int        insert_key        ( char *data );

            /*fix up*/
    int        fixup            ( Node *z );

            /*delete key*/
    int        delete_key        ( Node *z );

            /*delete fixup*/
    int        delete_fixup    ( Node *x );

            /*in order traverse*/
    void    inOrderTraverse    ( Node *n );

            /*post order delete*/
    void    postOrderDelete    ( Node *n );
};
RedBlackTree.cpp
------------------------------------------------------------------------------------------
#include "RedBlackTree.h"

/**
*    tree successor
*    @param: x is Node type data
*    @return: return y is a another Node type pointer
*/
Node* RedBlackTree::tree_successor( Node* x )
{
    if ( x->right != Nil )
    {
        x = x->right;

        while ( x->left != Nil )
        {
            x = x->left;
        }

        return x;
    }

    Node* y = x->parent;

    while ( y != Nil && x == y->right )
    {
        x = y;
        y = y->parent;
    }

    return y;
}

/**
*    left rotate
*    @param: x is a node type data
*    @return: 0 by default
*/
int RedBlackTree::leftRotate( Node *x )
{
    Node *y;
    y = x->right;
    x->right = y->left;
    y->left->parent = x;
    y->parent = x->parent;

    if ( x->parent == Nil )
        Root = y;
    else if ( x == x->parent->left )
        x->parent->left = y;
    else
        x->parent->right = y;
    y->left = x;
    x->parent = y;
    return 0;
}

/**
*    right rotate
*    @param: x is a Node type pointer
*    @return: 0 by default
*/
int RedBlackTree::rightRotate( Node *x )
{
    Node *y;

    y = x->left;
    x->left = y->right;
    y->right->parent = x;
    y->parent = x->parent;

    if ( x->parent == Nil )
        Root = y;
    else if ( x == x->parent->right )
        x->parent->right = y;
    else
        x->parent->left = y;
    y->right = x;
    x->parent = y;
    return 0;
}

/**
*    insert key
*    @param: data is a char type pointer
*    @return: 0 by default
*    @see: fixup()
*/
int RedBlackTree::insert_key( char *data )
{
    Node *x, *y, *z;

    y = Nil;
    x = Root;

    while ( x != Nil )
    {
        y = x;
        if ( strcmp( data, x->data ) < 0 )
            x = x->left;
        else if ( strcmp( data, x->data ) > 0 )
            x = x->right;
        else
            return -1;
    }

    z = new Node(data);
    z->parent = y;
    z->color = 'R';
    z->left = z->right = Nil;


    if ( y == Nil )
        Root = z;
    else
    {
        if ( strcmp( data, y->data ) <= 0 )
            y->left = z;
        else
            y->right = z;
    }

    fixup( z );

    x = y = NULL;
    return 0;
}

/**
*    fixup the node
*    @param:        z is a Node type pointer
*    @return:    0 by default
*    @see:        leftRotate(Node*)
*    @see:        rightRotate(Node*)
*/
int RedBlackTree::fixup( Node *z )
{
    Node *y;

    while ( z->parent->color == 'R' )
    {
        if ( z->parent == z->parent->parent->left )
        {
            y = z->parent->parent->right;

            if ( y->color == 'R' )
            {
                z->parent->color = 'B';
                y->color = 'B';
                z->parent->parent->color = 'R';
                z = z->parent->parent;
            }
            else
            {
                if ( z == z->parent->right )
                {
                    z = z->parent;
                    leftRotate( z );
                }

                z->parent->color = 'B';
                z->parent->parent->color = 'R';
                rightRotate( z->parent->parent );
            }
        }
        else
        {
            y = z->parent->parent->left;

            if ( y->color == 'R' )
            {
                z->parent->color = 'B';
                y->color = 'B';
                z->parent->parent->color = 'R';
                z = z->parent->parent;
            }
            else
            {
                if ( z == z->parent->left )
                {
                    z = z->parent;
                    rightRotate( z );
                }
                z->parent->color = 'B';
                z->parent->parent->color = 'R';
                leftRotate( z->parent->parent );
            }
        }
    }

    Root->color = 'B';

    y = NULL;
    return 0;
}

/**
*    delete key
*    @param:    z is a Node type param
*    @see:    tree_successor(Node*)
*    @see:    delete_fixup(Node*)
*    @return:0 by default
*/
int RedBlackTree::delete_key( Node *z )
{
    Node *x, *y;

    if ( z->left == Nil || z->right == Nil )
    {
        y = z;
    }
    else
    {
        y = this->tree_successor( z );
    }

    if ( y->left != Nil )
    {
        x = y->left;
    }
    else
    {
        x = y->right;
    }

    x->parent = y->parent;

    if ( y->parent == Nil )
    {
        Root = x;
    }
    else
    {
        if ( y == y->parent->left )
        {
            y->parent->left = x;
        }
        else
        {
            y->parent->right = x;
        }
    }

    if ( y != z )
    {
        strcpy( y->data, z->data );
        //z->data = y->data;
    }

    if ( y->color == 'B' )
    {
        delete_fixup( x );
    }

    return 0;
}

/**
*    delete fixup
*    @param: x is a Node type param
*    @return: 0 by default
*    @see:    leftRotate(Node*)
*    @see:    rightRotate(Node*)
*/
int RedBlackTree::delete_fixup( Node *x )
{
    Node *w;

    while ( x != Root && x->color == 'B' )
    {
        if ( x == x->parent->left )
        {
            w = x->parent->right;

            if ( w->color == 'R' )
            {
                w->color = 'B';
                x->parent->color = 'R';
                leftRotate( x->parent );
                w = x->parent->right;
            }

            if ( w->left->color == 'B' && w->right->color == 'B' )
            {
                w->color = 'R';
                x = x->parent;
            }
            else if ( w->right->color == 'B' )
            {
                w->left->color = 'B';
                w->color = 'R';
                rightRotate( w );
                w = x->parent->right;
            }

            w->color = x->parent->color;
            x->parent->color = 'B';
            w->right->color = 'B';
            leftRotate( x->parent );
            x = Root;
        }
        else
        {
            w = x->parent->left;

            if ( w->color == 'R' )
            {
                w->color = 'B';
                x->parent->color = 'R';
                rightRotate( x->parent );
                w = x->parent->left;
            }

            if ( w->right->color == 'B' && w->left->color == 'B' )
            {
                w->color = 'R';
                x = x->parent;
            }
            else if ( w->left->color == 'B' )
            {
                w->right->color = 'B';
                w->color = 'R';
                leftRotate( w );
                w = x->parent->left;
            }

            w->color = x->parent->color;
            x->parent->color = 'B';
            w->left->color = 'B';
            rightRotate( x->parent );
            x = Root;
        }
    }

    x->color = 'B';

    return 0;
}

/**
*    in order traverse
*    @param:    n is a Node type param
*    @desc:    recursive calling
*/
void RedBlackTree::inOrderTraverse( Node *n )
{
    if ( n != Nil )
    {
        inOrderTraverse( n->left );
        puts( n->data );
        inOrderTraverse( n->right );
    }
}

/**
*    post order delete
*    @param: n is a Node type
*    @desc:    recursive calling
*/
void RedBlackTree::postOrderDelete( Node *n )
{
    if ( n != Nil )
    {
        postOrderDelete( n->left );
        postOrderDelete( n->right );
        delete( n );
    }
}

/**
*    Constructor
*    Nil equal Node
*    Root equal Nil
*/
RedBlackTree::RedBlackTree()
{
    this->Nil = new Node();
    this->Root = Nil;
}

/**
* destructor
* @see: postOrderDelete(Node*)
*/
RedBlackTree::~RedBlackTree()
{
    postOrderDelete( this->Root );
    delete (this->Nil);
}

Wednesday, February 23, 2011

+ Project work flow and description (Business Profile extraction) at the very first time.

BusinessSearch Project work flow and description

Objective of BusinessSearch Project:

Main goal of this project is to collect the business profile in a automated way and try to develop a system that can update the profile in quarter or half year basis.

Flow Description:

The target of our project is to collect maximum business profile. But the business profile is fully depends on host name. If we have a single host name then we are able to create a profile by using it’s home page, contact us, about us pages. But we do not have enough host names to extract Business profile for our project . That is why our main target is to collect maximum host name and convert it to a perfect business profile.

Current / available attributes for Business profile are:

· Fax

· Email

· Website

· Zip code

· Address

· Phone no

· GPS location

· Contract person

· Contact person’s designation

Upcoming/ future attributes are:

· Business type

· Business Description

· Owner/ Proprietor/ Director

· Establishment date

· Registration Date

· Logo

Challenges: WebCrawler

· Robustness

· Mirror

· Hashing

· Unexpected bug

Work done so far in Business Search:

· Fax : 80- 90% accuracy

· Phone: 80-90%

· Zip Code: 70-80%

· Website: 85-90%

· Email: 85-90%

· Address: 45-50%

· Company Name: 45-50%

· Contact Person: 50-60%

· Contact Designation: 60-70%

· GPS Cordinate: 60-70% but (we will consider it later with the support of iSearch/Lucene)

· Branch: 70-80%

Yet to Develop:

· Business type – (Machine Learning)

· Business Description – (Snippet)

· Owner/ Proprietor/ Director (Natural Language Processing)

· Establishment date - Parsing

· Registration Date – Parsing

· Logo – Paring + Tricks + Crawling

+ Business Profile extraction Process flow diagram

Here I'm giving you a Business Profile extraction process flow diagram. But this is not a flow diagram. Actually I'm giving you a basic idea of a modules name in our project.

+ Business Profile extraction from webpages or business directory report

Few days ago, we have tried to develop a business profile extraction parser. But if you want to write down a business profile parser then you have to do a proper R&D over that.

Right now I'm giving you a my first R&D report of a business profile parser.

Current R&D Target:

Developing an efficient intelligent profile parsing system that is able to parse profiles from at least 80% - 90% Business Directory WebPages.

Progress:

Our target Business-Directory-WebPages are classified into two groups:

1. Multiple-Profile Pages

2. Single-Profile Pages

Multiple Profile Pages:

- These WebPages contains Multiple Business Profiles on a single page and there are common patterns in the html structure of those profiles.

- Our intelligent parser would identify this repeated pattern and parse the profiles accordingly.

- There is a big possibility that, some invalid data would be parse as business profiles, but our profile validator will discard them.

Single Profile Pages:

- This case is a little harder than the previous case.

- But, the approach is almost similar to the previous one.

- In maximum business directories, different Single-Profile-Pages maintain the same pattern.

Currently we are focusing on the first case (Multiple-Profile Page). We are using selected pages from our crawled data.

Challenges:

The main challenge is to define and identify the pattern. Most of the html pages are semi-structured. So, there is no universal pattern that works for each one.

Here are some sample patterns:

Figure 1:

This is a part of the html, from a multiple-profile page. The highlighted (yellow) region contains muiltiple business profiles.

Figure 2:

In this html, each profile is contained in the <div class=”listing”> tag.

Figure 3:

This is a more complex pattern. Here, each of the 3 consecutive (highlighted) <tr> tags contains a single profile.

Figure 4:

The previous examples imply that, identifying repeated patterns allows us to parse multiple profiles. But, this figure shows a complex counter-example. Here, the pattern in the yellow region is repeated, but they don’t contain any business profiles. All the business profiles are contained in the <tr> tag (marked with a red rectangle). The next image is the expansion of this <tr> tag.

Figure 5:

This is the expansion of the <tr> tag (marked with red rectangle) from the previous example (Figure 4). Each of the table in the yellow region contains a business profile.

Figure 6:

This image shows the expansion of the <table> tags from the previous example (Figure 5).

Figure 7: This is the expansion of the 3 consecutive <tr> tags from the figure 3.

Approach:

After analyzing many business directory websites, we have found some possible techniques to identify the profile patterns.

Each business profile contains multiple entries (Company Name, Email, Address etc.). Each of these entries has a unique html tag-chain. Depending on these tag-chains we can identify the possible data segments in the following way:

- At first, we have to identify the chain that signifies the start of a profile segment. We call it, the main-chain.

- Initially, we considered the most frequent chain as the main-chain. But, this heuristic fails for some cases.

- So, we consider all the chains in the non-increasing order of frequency and check for the validity of the result. But, the validity checking has many hard challenges that are part of Natural Language Processing.

Example:

Here is an example of a main-chain for a webpage:

Most Frequent Chain: <tr>||<td width="64" valign="top">||</td>||<td width="102" valign="top">||</td>||<td width="368" valign="top">||</td>||</tr>||

Frequency: 167

Using this chain we could parse profiles successfully. But, this is not always possible.

Counter Example:

Here are 2 of the tag-chains from the Webpage of figure 4:

Chain 1: <tr>||<td width="100" align="right" class="SideMenuLink1">||</td>||<td class="SideMenuLink1">||</td>||</tr>||

Frequency: 80

Chain 2: <tr>||<td colspan="2" class="SideMenuTitle">||</td>||</tr>||

Frequency: 31

Here, chain 2 is the main-chain, which is not the most-frequent chain.

Figure 8:

This is an example of a simple html-parse-tree. Every vertex represents a single html-tag. The red-vertices contain the profile-data. The path from the root to a leaf is the tag-chain that we previously described. But, most of the html-parse-trees are more complex.

But we are not going to describe those complex cases which has been skiped by us.

Don't think you can't