/***************************************************************
 * file: ZONE.C
 * purpose: text region detection via cellular automata
 **************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "zone.h"

/* local data */
static ZONE zone_list[MAX_ZONES];   /* this could be dynamically allocated */
static int zone_count;


/* local prototypes */
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley);
static void cut_vertical_lines(unsigned char *image,int dx,int dy);
static void block_zones(unsigned char *image,int dx,int dy,int coarseness);
static void sequence_zones(unsigned char *image,int dx,int dy,int order);
static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE *zone_ptr);
static void overlap_zones(ZONE *zone_array,int *array_size);


/************************************************
 * function: int zone(int coarseness)
 *  The process steps are:
 *      1) Sample the page
 *      2) Cut vertical lines from the page
 *      3) Block out zones via cellular automata
 *      4) Extract the zones
 *      5) Sequence zones
 * parameters: coarseness value 0-5, order (COLUMN or ROW)
 * returns: 1 if OK or 0 if error,  see errno
 ************************************************/
int zone(int coarseness,int order)
    {
    unsigned char *image;
    int dx,dy;
    int samplex,sampley;
    int i;

    if (coarseness < 0 || coarseness > 5)
        {               /* test coarseness parameter */
        errno = EINVAL;
        return 0;
        }
                        /* get a scaled copy of the page */
    image = sample_page(&dx,&dy,&samplex,&sampley);
    if (image == NULL)                      /* memory? */
        return 0;
#if CUT_VERTICAL_LINES
    cut_vertical_lines(image,dx,dy);        /* remove boxes around text */
#endif
    block_zones(image,dx,dy,coarseness);    /* block out zones */
    sequence_zones(image,dx,dy,order);
    for (i = 0 ; i < zone_count ; i++)
        {                                   /* translate to full page */
        zone_list[i].x1 *= samplex;
        zone_list[i].y1 *= sampley;
        zone_list[i].x2 *= samplex;
        zone_list[i].y2 *= sampley;
        }
    free(image);                            /* clean up */
    return 1;
    }


/**************** LOCAL FUNCTIONS ****************/

/************************************************
 * function: static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley)
 *  Sample the page.  Normally, the entire page is stored in memory.  Since
 *  the memory requirements are typically a megabyte, the page, in DOS
 *  machines, is somewhere in extended memory.  So that this demo can
 *  work on machines lacking extended memory, I sampled the page when I
 *  scaled it for display.  See display_sample() below.  However, I
 *  have #ifed around the code that is normally used to sample the page
 *  from the memory image.  You need to provide two functions, as well as
 *  the extended memory handler.  They are void memory_info(DISPLAY_BOX *);
 *  which gets the x/y extents of the image, and  (unsigned char *
 *  memory_ptr(int y); which returns a pointer to a scan line in the memory
 *  image.  Sample_page() creates a standardized, reduced image suitable for
 *  cellular automation.  The sample has each byte, not bit, representing a
 *  pixel area.  The byte is 0xff if black or 0x00 if white.
 *  The sampling procedure is important for region finding.  If possible
 *  it should be a function of the density of the original image.  If
 *  the image isn't square, for example a 200x100 fax file, then the
 *  x and y sampling should be adjusted accordingly.  Since I don't
 *  have dpi information here, I am scaling it to a typical, 200dpi,
 *  value after it was adjusted for screen display.
 *  Note: Bit 7 is leftmost and 0 is rightmost; 1 bits are black, 0 are white.
 * parameters: pointers to storage for the byte width and height of the sampled
 *              image and the sample bit distance in x and y
 * returns: pointer to sampled page or NULL if no memory
 ************************************************/
static unsigned char *sample_page(int *dx,int *dy,int *samplex,int *sampley)
    {
    static unsigned char bit_mask[] = {0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01};
    unsigned char *image,*line_ptr,*line_ptr2,*buff_ptr;
    DISPLAY_BOX file;
    unsigned int x,y,width,height;

    memory_info(&file);     /* need to provide this, gets file dimensions */
                            /* from memory image of file */
    *samplex = SAMPLE_200_X;        /* sample sizes */
    *sampley = SAMPLE_200_Y;
    *dx = file.width / *samplex;    /* extent of sample */
    *dy = file.height / *sampley;
    while (((long)*dx * (long)*dy) > MAX_MALLOC)
        {           /* adjust sampling to fit memory restrictions */
        (*samplex)++;
        (*sampley)++;
        *dx = file.width / *samplex;
        *dy = file.height / *sampley;
        }
    if ((image = malloc(*dx * *dy)) == NULL)    /* allocate sample buffer */
        return NULL;
    memset(image,WHITE,*dx * *dy);          /* set to white */
    width = *dx * *samplex;
    height = *dy * *sampley;
    if (*samplex >= 8)
        {                                   /* byte sampling */
        for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
            {                               /* for each y sample */
                        /* need to provide memory_ptr which gets a pointer
                         * to a scan line from the memory image of the file */
            line_ptr = memory_ptr(y);
            line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
            for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
                if (*(line_ptr+(x>>3)) | *(line_ptr2+(x>>3)))
                    *buff_ptr = BLACK;  /* if byte has black, set sample */
            }
        }
    else
        {                                   /* bit sampling */
        for (y = 0, buff_ptr = image ; y < height ; y += *sampley)
            {                               /* for each y sample */
                        /* need to provide memory_ptr which gets a pointer
                         * to a scan line from the memory image of the file */
            line_ptr = memory_ptr(y);
            line_ptr2 = memory_ptr(y + *sampley/2); /* double sample in y */
            for (x = 0 ; x < width ; x += *samplex, buff_ptr++)
                if ((*(line_ptr+(x>>3)) | *(line_ptr2+(x>>3))) & bit_mask[x&7])
                    *buff_ptr = BLACK;      /* if bit is black, set sample */
            }
        }
    return image;
    }


#if CUT_VERTICAL_LINES
/************************************************
 * function: static void cut_vertical_lines(unsigned char *image,int dx,int dy)
 *  Remove vertical lines from sample.  The purpose of this function is to
 *  unbox text.  Removing the vertical box lines accomplishes this.  Trying
 *  to remove the horizontal lines is dangerous because you might also remove
 *  the text.
 * parameters: pointer to sampled image buffer and x/y extents of buffer
 * returns: nothing
 ************************************************/
static void cut_vertical_lines(unsigned char *image,int dx,int dy)
    {
    int x,y,count,y1;
    unsigned char *ptr,*qtr;

    for (x = 0 ; x < dx ; x++)      /* scan image left to right */
        {
        ptr = image+x;
        count = 0;
        for (y = 0 ; y < dy ; y++, ptr += dx)
            {                   /* scan up and down counting line pixels */
            if (*ptr)
                count++;
            else
                count = 0;
            if (count >= VERTICAL_LINE_SIZE)
                {               /* we have a veritcal line */
                for (y1=y, qtr=ptr ; *ptr!=WHITE && y>=0 ; y--, ptr-=dx)
                    *ptr = WHITE;   /* white out moving up */
                for (y=y1+1, ptr=qtr+dx ; *ptr!=WHITE && y<dy ; ptr+=dx)
                    *ptr = WHITE;   /* white out moving down */
                count = 0;      /* reset counter */
                }
            }
        }
    }
#endif


/************************************************
 * function: static void block_zones(unsigned char *image,int dx,int dy,int coarseness)
 *  Use cellular automata to create blocks from image.  Each block represents
 *  a region of text.  There are two steps to the process.  First, use cellular
 *  automata to sketch the reason.  Second, use the coarseness (0-5) to coalesce
 *  the regions.  A higher coarseness value will make larger zones.
 * parameters: pointer to image buffer, x/y extents of buffer and coarseness
 *             of zoning (0-5)
 * returns: nothing
 ************************************************/
static void block_zones(unsigned char *image,int dx,int dy,int coarseness)
    {
    int i,j,x,y,score;
    unsigned char *line,*pixel;

    for (y=1, line=image+dx ; y < dy-1 ; y++, line+=dx)
        {   /* walk through buffer scanning for neighbors */
        for (x=1, pixel=line+1 ; x < dx-1 ; x++, pixel++)
            {
            score = 0;
            if (*(pixel-1) & ALIVE)     /* left */
                score++;
            if (*(pixel+1) & ALIVE)     /* right */
                score++;
            if (*(pixel-dx) & ALIVE)    /* up */
                score++;
            if (*(pixel+dx) & ALIVE)    /* down */
                score++;
            if (*pixel == WHITE && score >= LIFE_SCORE)
                *pixel = PREGNANT;
            else if (*pixel == BLACK && score <= DEATH_SCORE)
                *pixel = SICK;
            }
        }
    for (pixel = image ; pixel < image + dy * dx ; pixel++)
        {   /* birth and bury */
        if (*pixel == PREGNANT)
            *pixel = BLACK;
        else if (*pixel == SICK)
            *pixel = WHITE;
        }
    if (coarseness <= 0)        /* no need to coalesce */
        return;
    /* coalesce regions, based on coarseness, i.e. coarseness == 2
     * will close all horizontal/vertical gaps of 2 pixels */
    for (y=0, line=image+1 ; y < dy ; y++, line+=dx)
        {   /* coalesce horizontally */
        for (x=1, pixel=line ; x < dx-coarseness ; x++, pixel++)
            {
            if (*pixel == WHITE && *(pixel-1) == BLACK)
                {
                for (i = 1 ; i <= coarseness ; i++)
                    {
                    if (*(pixel+i))
                        {
                        for (i = 0 ; i < coarseness ; i++, pixel++)
                            *pixel = BLACK;
                        pixel--;
                        x += coarseness-1;
                        break;
                        }
                    }
                }
            }
        }
    for (x=0, line=image+dx ; x < dx ; x++, line++)
        {   /* coalesce vertically */
        for (y=1, pixel=line ; y < dy-coarseness ; y++, pixel+=dx)
            {
            if (*pixel == WHITE && *(pixel-dx) == BLACK)
                {
                for (i=1, j=dx ; i <= coarseness ; i++, j+=dx)
                    {
                    if (*(pixel+j))
                        {
                        for (i = 0 ; i < coarseness ; i++, pixel+=dx)
                            *pixel = BLACK;
                        pixel -= dx;
                        y += (coarseness-1);
                        break;
                        }
                    }
                }
            }
        }
    }


/************************************************
 * function: static void sequence_zones(unsigned char *image,int dx,int dy,int order)
 *  Extract zones from buffer, place in zone_list, update zone_count and
 *  sequence zones so they are in either COLUMN_MAJOR or ROW_MAJOR reading order.
 * parameters: pointer to image buffer and x/y extents of image in buffer
 *              order, COLUMN_MAJOR or ROW_MAJOR
 * returns: nothing
 ************************************************/
static void sequence_zones(unsigned char *image,int dx,int dy,int order)
    {
    int x,y,i,j,index,fudge_x,fudge_y;
    unsigned char *ptr;
    ZONE temp;

    for (y=0, zone_count=0 ; y < dy && zone_count < MAX_ZONES ; y+=MIN_Y_ZONE)
        {   /* extract zones from block images in y order */
        ptr = image + y * dx;
        for (x = 0 ; x < dx  ; x += MIN_X_ZONE)
            if (*(ptr+x))                       /* found point */
                {
                if (zone_count >= MAX_ZONES)
                    break;
                while (x > 0 && *(ptr+x-1))     /* back up to left side */
                    x--;
                if (extract_zone(image,dx,dy,x,y,zone_list+zone_count))
                    zone_count++;               /* get zone */
                }
        }
    if (zone_count == 0)
        {           /* if no zones, make the entire page one big zone */
        zone_list[0].x1 = zone_list[0].y1 = 0;
        zone_list[0].x2 = dx-1;
        zone_list[0].y2 = dy-1;
        zone_count = 1;
        return;
        }
    overlap_zones(zone_list,&zone_count);       /* remove overlap */
    if (order == COLUMN_MAJOR)
        {
        for (i = 0 ; i < zone_count-1 ; i++)
            {                                       /* sort on x1 */
            for (j = i+1 ; j < zone_count ; j++)
                {
                if (zone_list[j].x1 < zone_list[i].x1)
                    {
                    temp = zone_list[i];
                    zone_list[i] = zone_list[j];
                    zone_list[j] = temp;
                    }
                }
            }
        for (i = 0 ; i < zone_count-1 ; i++)
            {           /* order zones left to right, up to down */
            x = zone_list[i].x2;
            y = zone_list[i].y1;
            fudge_x = zone_list[i].x1+dx/20;    /* 5% slippage on alignment */
            for (j = i+1, index = -1 ; j < zone_count ; j++)
                {   /* find any zones above and within x extent of zone_list[i] */
                if (zone_list[j].x1 > x)
                    break;
                if (zone_list[j].y1 < y && zone_list[j].x1 <= fudge_x)
                    {
                    x = zone_list[j].x2;
                    y = zone_list[j].y1;
                    index = j;
                    }
                }
            if (index != -1)
                {
                temp = zone_list[i];
                zone_list[i] = zone_list[index];
                zone_list[index] = temp;
                }
            }
        }
    else        /* ROW_MAJOR */
        {   /* already sorted in y1 order */
        for (i = 0 ; i < zone_count-1 ; i++)
            {           /* order zones up to down, left to right */
            y = zone_list[i].y2;
            x = zone_list[i].x1;
            fudge_y = zone_list[i].y1+dy/20;    /* 5% slippage on alignment */
            for (j = i+1, index = -1 ; j < zone_count ; j++)
                {   /* find any zones left of and within y extent of zone_list[i] */
                if (zone_list[j].y1 > y)
                    break;
                if (zone_list[j].x1 < x && zone_list[j].y1 <= fudge_y)
                    {
                    y = zone_list[j].y2;
                    x = zone_list[j].x1;
                    index = j;
                    }
                }
            if (index != -1)
                {
                temp = zone_list[i];
                zone_list[i] = zone_list[index];
                zone_list[index] = temp;
                }
            }
        }
    }


/************************************************
 * function: static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE *zone_ptr)
 *  Extracts a rectangular region from the buffer starting at a point.  It does
 *  this by wlaking the perimeter, recording the min and max x/y dimensions.
 *  If the region is big enough to be significant, it is saved as a zone and then
 *  whited out so it won't be reconsidered.
 * parameters: pointer to image buffer and dx/dy extents of that buffer.
 *             x & y point to start extracting the zone.
 *             pointer to storage for zone
 * returns: 1 if zone OK or 0 if not
 ************************************************/
static int extract_zone(unsigned char *image,int dx,int dy,int x,int y,ZONE *zone_ptr)
    {
    int ix,iy,minx,miny,maxx,maxy;  /* perimeter variables & min/max values */
    HEADING dir;                    /* current direction */
    unsigned char *previous,*next,*here;    /* buffer pointers */

    minx = maxx = ix = x;           /* preset min/max x/y and perimeter vars */
    miny = maxy = iy = y;
    dir  = GOING_UP;                /* starting direction */
    do      /* walk perimeter, recording min/max of rectangular region */
        {
        if (ix < minx)              /* update min/max */
            minx = ix;
        if (ix > maxx)
            maxx = ix;
        if (iy < miny)
            miny = iy;
        if (iy > maxy)
            maxy = iy;
        here = image + iy * dx + ix;    /* where are we? */
        next = here + dx;
        previous = here - dx;
        switch (dir)                    /* based on current direction, */
            {                           /* look around for next direction */
            case GOING_UP:
            if (ix > 0 && *(here-1))
                {
                ix--;
                dir = GOING_LEFT;
                break;
                }
            if (iy > 0 && *previous)
                {
                iy--;
                break;
                }
            if (ix < dx-1 && *(here+1))
                {
                ix++;
                dir = GOING_RIGHT;
                break;
                }
            if (iy < dy-1 && *next)
                {
                iy++;
                dir = GOING_DOWN;
                break;
                }
            break;
            case GOING_RIGHT:
            if (iy > 0 && *previous)
                {
                iy--;
                dir = GOING_UP;
                break;
                }
            if (ix < dx-1 && *(here+1))
                {
                ix++;
                break;
                }
            if (iy < dy-1 && *next)
                {
                iy++;
                dir = GOING_DOWN;
                break;
                }
            if (ix > 0 && *(here-1))
                {
                ix--;
                dir = GOING_LEFT;
                break;
                }
            break;
            case GOING_DOWN:
            if (ix < dx-1 && *(here+1))
                {
                ix++;
                dir = GOING_RIGHT;
                break;
                }
            if (iy < dy-1 && *next)
                {
                iy++;
                break;
                }
            if (ix > 0 && *(here-1))
                {
                ix--;
                dir = GOING_LEFT;
                break;
                }
            if (iy > 0 && *previous)
                {
                iy--;
                dir = GOING_UP;
                break;
                }
            break;
            case GOING_LEFT:
            if (iy < dy-1 && *next)
                {
                iy++;
                dir = GOING_DOWN;
                break;
                }
            if (ix > 0 && *(here-1))
                {
                ix--;
                break;
                }
            if (iy > 0 && *previous)
                {
                iy--;
                dir = GOING_UP;
                break;
                }
            if (ix < dx-1 && *(here+1))
                {
                ix++;
                dir = GOING_RIGHT;
                break;
                }
            break;
            }
        } while (ix != x || iy != y);       /* until we return to the start */
    for (iy=miny, here=image+miny*dx+minx ; iy <= maxy ; iy++, here+=dx)
        memset(here,WHITE,maxx-minx+1);     /* whiteout the region */
    if ((maxx-minx+1 < MIN_X_ZONE) || (maxy-miny+1 < MIN_Y_ZONE))
        return 0;                           /* big enough? */
    if (minx > 0)               /* expand dimensions by one pixel */
        minx--;
    if (maxx < dx-1)
        maxx++;
    if (miny > 0)
        miny--;
    if (maxy < dy-1)
        maxy++;
    zone_ptr->x1 = minx;        /* save zone */
    zone_ptr->y1 = miny;
    zone_ptr->x2 = maxx;
    zone_ptr->y2 = maxy;
    return 1;
    }


/************************************************
 * function: static void overlap_zones(ZONE *zone_array,int *array_size)
 *  find zones that overlap and combine them
 * parameters: pointer to array of zones and pointer to count of zones
 * returns: nothing
 ************************************************/
static void overlap_zones(ZONE *zone_array,int *array_size)
    {
    int i,j,ax1,ay1,ax2,ay2,bx1,by1,bx2,by2;

    for (i = 0 ; i < *array_size-1 ; i++)
        {   /* compare each zone against the rest */
        ax1 = (zone_array+i)->x1;
        ay1 = (zone_array+i)->y1;
        ax2 = (zone_array+i)->x2;
        ay2 = (zone_array+i)->y2;
        for (j = i+1 ; j < *array_size ; j++)
            {
            bx1 = (zone_array+j)->x1;
            by1 = (zone_array+j)->y1;
            bx2 = (zone_array+j)->x2;
            by2 = (zone_array+j)->y2;
            if ((bx1>=ax2) || (ax1>=bx2) || (by1>=ay2) || (ay1>=by2))
                continue;           /* no overlap */
            bx1 = min(bx1,ax1);     /* combine zones */
            by1 = min(by1,ay1);
            bx2 = max(bx2,ax2);
            by2 = max(by2,ay2);
            (zone_array+i)->x1 = bx1;
            (zone_array+i)->y1 = by1;
            (zone_array+i)->x2 = bx2;
            (zone_array+i)->y2 = by2;
            (*array_size)--;        /* new zone count */
            memmove((char *)(zone_array+j),(char *)(zone_array+j+1),
                sizeof(ZONE)*(*array_size-j));  /* shift array to remove zone */
            i = -1;                 /* start all over */
            break;
            }
        }
    }

