[FrontPage] [TitleIndex] [WordIndex

Note: You are looking at a static copy of the former PineWiki site, used for class notes by James Aspnes from 2003 to 2012. Many mathematical formulas are broken, and there are likely to be other bugs as well. These will most likely not be fixed. You may be able to find more up-to-date versions of some of these notes at http://www.cs.yale.edu/homes/aspnes/#classes.

1. Searching in columns

For this assignment you are to write a program search.c that finds all occurrences of a given string (the needle) in a text (the haystack). However, unlike standard-issue search programs like grep, your program should find vertical occurrences of the needle, when reading the text down in columns.

2. Example

In the text

aba
ab
ba

the string ba occurs in exactly one place: starting from column 1 of line 1 (counting from 0 in both cases):

aba
aB
bA

3. Input format

Your program should read the haystack from stdin. It should obtain the needle from argv[1]. It is good practice to test argc to make sure that argv[1] exists, but you will not be tested on any inputs that do not provide exactly one argument. You may assume that each line of the haystack is terminated by a newline and that neither the haystack nor the needle contain any null characters.

4. Output format

For each starting line and column where the needle matches, your program should output a single line of text giving the starting line and column numbers (counting from 0). This line should be formatted as if printed with the function call

   1     printf("%d %d\n", line, column);

If there are multiple matches, each should be printed in order of increasing line number first and then increasing column numbers within the same line. For example, the output from a search for aa in the haystack

aaa
aa
a

should be

0 0
0 1
1 0

5. Things to watch out for

5.1. Special characters

Your program should allow any character except '\0' or '\n' within both the haystack and needle. The column number of a character is defined as the number of characters that precede it in the same line, regardless of how it looks when printed on the screen (e.g. a tab character '\t' may produce up to 8 spaces on the screen but still counts as only one character for the purpose of computing column positions).

5.2. Short lines

A match can occur only in a column only if there are enough characters in each of the lines that the column spans. You will need to do something to make sure that searches do not attempt to include missing characters from short lines.

6. Submitting your program

Submit your program as usual with the command

/c/cs223/bin/submit 3 search.c

7. Testing your program

A public test script is available on the Zoo as /c/cs223/Hwk3/test.search.

8. Sample solutions

   1 /* Vertical word search program */
   2 /* Takes as input a sequence of lines of possibly varying length. */
   3 /* Searches each column for argv[1] */
   4 /* Prints the line number and column number of any matching locations (counting from 0) */
   5 /*  in increasing order by line number then column number. */
   6 
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <string.h>
  10 
  11 typedef struct line {
  12     int len;		/* length of data */
  13     char *data;		/* null-terminated contents */
  14 } *Line;
  15 
  16 /* used internally by getline; initial size of buffer */
  17 #define INITIAL_LINE_SIZE (16)
  18 
  19 /* reads a line of text terminated by newline or end-of-file and returns it */
  20 /* returns 0 on allocation error or end-of-file at start of line */
  21 Line
  22 getline(FILE *f)
  23 {
  24     int size;
  25     Line line;
  26     int c;
  27 
  28     line = malloc(sizeof(*line));
  29     if(line == 0) return 0;
  30 
  31     size = INITIAL_LINE_SIZE;
  32     line->len = 0;
  33     line->data = malloc(size);
  34 
  35     for(;;) {
  36 	/* grow the buffer if full */
  37 	if(line->len >= size) {
  38 	    size *= 2;
  39 	    line->data = realloc(line->data, size);
  40 	    if(line->data == 0) {
  41 		free(line);
  42 		return 0;
  43 	    }
  44 	}
  45 	/* grab and parse the next character */
  46 	c = getc(f);
  47 	if(c == '\n') {
  48 	    /* we are done */
  49 	    break;
  50 	} else if(c == EOF) {
  51 	    if(line->len == 0) {
  52 		/* end of file at start of line */
  53 		free(line->data);
  54 		free(line);
  55 		return 0;
  56 	    } else {
  57 		/* we are done again */
  58 		break;
  59 	    }
  60 	} else {
  61 	    /* add it in */
  62 	    line->data[line->len++] = c;
  63 	}
  64     }
  65 
  66     /* we are done; clean up */
  67     /* mark end of string */
  68     line->data[line->len] = '\0';
  69     /* give up extra space in buffer */
  70     line->data = realloc(line->data, line->len + 1);
  71     return line;
  72 }
  73 
  74 void
  75 free_line(Line line)
  76 {
  77     free(line->data);
  78     free(line);
  79 }
  80 
  81 #define INITIAL_LINES (16)	/* initial size of lines buffer in getlines */
  82 
  83 /* get a bunch of lines, terminating when getline fails */
  84 /* returns an array of the lines in order, with the number of lines stored
  85  * in *n */
  86 /* returns 0 on allocation error */
  87 Line *
  88 getlines(FILE *f, int *n)
  89 {
  90     int size;
  91     Line *lines;
  92     Line next_line;
  93 
  94     size = INITIAL_LINES;
  95     *n = 0;
  96     lines = malloc(sizeof(*lines) * size);
  97     if(lines == 0) return 0;
  98 
  99     while((next_line = getline(f)) != 0) {
 100 	/* maybe grow buffer */
 101 	if(*n >= size) {
 102 	    size *= 2;
 103 	    lines = realloc(lines, sizeof(*lines) * size);
 104 	    if(lines == 0) return 0;
 105 	}
 106 	/* store the new line */
 107 	lines[(*n)++] = next_line;
 108     }
 109 
 110     /* shrink to fit */
 111     lines = realloc(lines, sizeof(*lines) * (*n));
 112     
 113     return lines;
 114 }
 115 
 116 void
 117 free_lines(Line *lines, int n)
 118 {
 119     int i;
 120 
 121     for(i = 0; i < n; i++) {
 122 	free_line(lines[i]);
 123     }
 124     free(lines);
 125 }
 126 
 127 /* return true if needle occurs in first numlines entries in lines */
 128 /* starting at position (line, column) */
 129 int
 130 occurs_at(Line *lines, int numlines, int line, int column, const char *needle)
 131 {
 132     /* strategy: we'll march line and needle together, returning 0 on mismatch */
 133     while(line < numlines && *needle) {
 134 	if(lines[line]->len <= column) {
 135 	    /* this line isn't long enough, we lose */
 136 	    return 0;
 137 	} else if(lines[line]->data[column] != *needle) {
 138 	    /* character doesn't match, we lose again */
 139 	    return 0;
 140 	} else {
 141 	    /* keep going */
 142 	    line++;
 143 	    needle++;
 144 	}
 145     }
 146 
 147     /* did we reach end of needle or end of lines? */
 148     return *needle == '\0';
 149 }
 150 
 151 int
 152 main(int argc, char **argv)
 153 {
 154     Line *lines;
 155     int n;
 156     int maxstart;	/* n - length of needle; last possible starting location */
 157     int line;
 158     int column;
 159     const char *needle;
 160 
 161     if(argc != 2) {
 162 	fprintf(stderr, "Usage: %s needle < lines\n", argv[0]);
 163 	return 1;
 164     }
 165 
 166     needle = argv[1];
 167 
 168     lines = getlines(stdin, &n);
 169 
 170     maxstart = n - strlen(needle);
 171 
 172     for(line = 0; line <= maxstart; line++) {
 173 	for(column = 0; column < lines[line]->len; column++) {
 174 	    if(occurs_at(lines, n, line, column, needle)) {
 175 		printf("%d %d\n", line, column);
 176 	    }
 177 	}
 178     }
 179 
 180     free_lines(lines, n);
 181 
 182     return 0;
 183 }
search.c

2014-06-17 11:58