OOFILE  1.9
oofwords.cpp
Go to the documentation of this file.
1 /* oofWords.cpp: oofWordParser class implementation
2  Note that within valid words, punctuation is ignored
3 */
4 
5 #include "oofpch_c.h" // for precompilation of core files
6 
7 #ifndef H_OOFIOS
8  #include "oofios.h"
9 #endif
10 #ifndef H_OOF2
11  #include "oof2.h"
12 #endif
13 #ifndef H_OOF3
14  #include "oof3.h"
15 #endif
16 #ifndef H_OOFREL
17  #include "oofrel.h"
18 #endif
19 #ifndef H_OOFQUERY
20  #include "oofquery.h"
21 #endif
22 #ifndef H_OOFRAM
23  #include "oofram.h"
24 #endif
25 #ifndef H_OOFWORDS
26  #include "oofwords.h"
27 #endif
28 #ifndef H_OOFFILES
29  #include "ooffiles.h"
30 #endif
31 
32 #include <ctype.h>
33 
34 #ifdef OOF_MEM_DEBUG_LAST_INCLUDE
35  #include OOF_MEM_DEBUG_LAST_INCLUDE
36 #endif
37 
38 #ifndef OOF_NO_STDLIB
39  #ifndef std
40  using namespace std;
41  #endif
42 #endif
43 
44 
45 
46 
47 // -------------------------------------------------------
48 // o o f W o r d P a r s e r
49 // -------------------------------------------------------
50 oofWordParser::oofWordParser(const char * s, unsigned short minWordLen,
51  const char ** stopWords, unsigned short numStopWords) :
52  mpString(0),
53  mpStringStart(0),
54  mMinWordLength(minWordLen),
55  mStopWords(stopWords),
56  mNumStopWords(numStopWords),
57  mGeneratedStrings(0),
58  mGeneratedCount(0),
59  mIterStrings(0)
60 {
61  if (s)
62  start(s);
63 } // oofWordParser
64 
65 
72 {
73  mReferences = 0; // prevent assertion in parent class if someone deletes us
74  // like having a stack-based word parser
75  if (mGeneratedStrings) {
76  const unsigned long numWords = mGeneratedCount;
77  for (unsigned int i=0; i<numWords; i++) {
78  #ifdef _Windows
79  delete[] (char*)(mGeneratedStrings[i]); // hack for Windows
80  #else
81  delete[] mGeneratedStrings[i];
82  #endif
83  }
84  delete[] mGeneratedStrings;
85  }
86 }
87 
88 
89 void
90 oofWordParser::start(const char* s)
91 // skips over non-alphanumerical characters
92 // and primes pointers to the first word
93 {
94  if (mGeneratedStrings) {
95  mIterStrings = 0;
97  strncpy(mWord, mpString, maxWordLength);
98  }
99  else {
100  // position start of string
101  mpStringStart = mpString = s;
102  bool firstWord = true;
103  do {
104  // skip over (invalid) word to space
105  int i;
106  if (!firstWord) {
108  for (i = 0; mpString[i]&&!isspace(mpString[i]); ++i)
109  ;
110  mpString += i;
111  }
112  // skip over spaces and non-printable characters
113  for (i = 0; mpString[i]&&!isalnum(mpString[i]); ++i)
114  ;
115  mpString += i;
116  // write each valid character to mWord
117  int p = 0;
118  mWordLength = 0;
119  for (i = 0; mpString[i]&&!isspace(mpString[i]); ++i)
120  if (!ispunct(mpString[i]))
121  mWord[p++] = mpString[i];
122  mWord[p] = '\0';
123  mWordLength = p;
124  firstWord = false;
125  } while ((mWordLength>0)&&!isValidWord(mWord));
126  }
127 } // start
128 
129 
136 // primes the private string pointer to the start
137 // of the next valid word
138 {
139  assert(mpStringStart);
140  if (mGeneratedStrings) {
141  mIterStrings++;
143  mWordLength = strlen(mpString);
144  strncpy(mWord, mpString, maxWordLength);
145  }
146  else do {
147  // skip over current word to space
148  int i;
150  for (i = 0; mpString[i]&&!isspace(mpString[i]); ++i)
151  ;
152  mpString += i;
153  // skip over spaces and non-printable characters
154  for (i = 0; mpString[i]&&!isalnum(mpString[i]); ++i)
155  ;
156  mpString += i;
157  // write each valid character to mWord
158  int p = 0;
159  mWordLength = 0;
160  for (i = 0; mpString[i]&&!isspace(mpString[i]); ++i)
161  if (!ispunct(mpString[i]))
162  mWord[p++] = mpString[i];
163  mWord[p] = '\0';
164  mWordLength = p;
165  } while ((mWordLength>0)&&!isValidWord(mWord));
166 } // next
167 
168 
169 bool oofWordParser::isValidWord(const char *s) const
170 // returns true if word is valid, otherwise false
171 {
172  // check for a word of valid length
173  if (mWordLength < mMinWordLength)
174  return false;
175  // check for a stop word
176  bool stopWord = false;
177  for (int i = 0; i < mNumStopWords; ++i)
178  if (OOF_stricmp(s,mStopWords[i])==0) {
179  stopWord = true;
180  break;
181  }
182  // return false if a stop word
183  if (stopWord)
184  return false;
185  return true;
186 } // isValidWord
187 
188 
195 void
197 {
198  assert(mpStringStart);
199  if (mGeneratedStrings) {
200  mIterStrings = 0;
202  strncpy(mWord, mpString, maxWordLength);
203  }
204  else {
205  if (mpStringStart != mpString) {
207  }
208  }
209 }
210 
211 
216 bool
218 {
219  assert(mpStringStart);
220  return (mpString[0]!=0);
221 }
222 
223 
227 const char*
229 {
230  assert(mpStringStart);
231  return mWord;
232 }
233 
234 
238 const char*
240 {
241  return mpStringStart;
242 }
243 
244 
245 
249 unsigned short&
251 {
252  return mMinWordLength;
253 }
254 
255 
256 void
258 {
259  for (start(); more(); next()) {
260  os << word() << endl;
261  }
262 }
263 
264 
268 void
270 {
271  if (mGeneratedStrings)
272  return; // have parsed!
273 
274  OOF_WordList tempWords;
275  for (start(); more(); next()) {
276  tempWords.append(word());
277  }
279 }
280 
281 
282 // -------------------------------------------------------
283 // O O F _ U n i q u e W o r d L i s t
284 // -------------------------------------------------------
285 // NOT YET IMPLEMENTED - use trie or something else that's better at contains()
286 
288 {
289 }
290 
291 
293 {
294  const unsigned long numWords = mWords.count();
295  for (unsigned int i=0; i<numWords; i++) {
296  char* aWord = (char*) mWords.value(i);
297  delete[] aWord;
298  }
299 }
300 
301 
302 
303 const char**
304 OOF_WordList::orphanStringArray(unsigned long& outCount)
305 {
306  const unsigned long numWords = mWords.count();
307  const char** retWords = new const char*[numWords];
308  for (unsigned int i=0; i<numWords; i++) {
309  retWords[i] = (char*) mWords.value(i);
310  mWords[i] = 0;
311  }
312  outCount = numWords;
313  return retWords;
314 }
315 
316 
317 bool
318 OOF_WordList::contains(const char* inWord) const
319 {
320  const unsigned long numWords = mWords.count();
321  for (unsigned int i=0; i<numWords; i++) {
322  const char* aWord = (const char*) mWords.value(i);
323  if (OOF_stricmp(aWord, inWord)==0)
324  return true;
325  }
326  return false;
327 }
328 
329 
330 void
331 OOF_WordList::append(const char* inWord)
332 {
333  const unsigned short wordLen = strlen(inWord);
334  char* newWord = new char[wordLen+1];
335  memcpy(newWord, inWord, wordLen);
336  newWord[wordLen] = '\0';
337  mWords.append((unsigned long)newWord);
338 }
339 
340 
341 // -------------------------------------------------------
342 // d b T a b l e
343 // -------------------------------------------------------
345 dbTable::hasWord(const char* str) const
346 {
348 }
349 
350 
352 dbTable::hasWordStartsWith(const char* str) const
353 {
355 }
356 
357 
359 dbTable::hasAnyWordsDelimited(const char* str, char delimiter)
360 {
362 }
363 
364 
366 dbTable::hasAllWordsDelimited(const char* str, char delimiter)
367 {
369 }
370 
371 
373 dbTable::hasAnyWordsOf(const char **schStrs, unsigned long count)
374 {
376 }
377 
378 
380 dbTable::hasAllWordsOf(const char **schStrs, unsigned long count)
381 {
383 }
384 
385 
388 {
389  inParser->generateSearchArray();
390  return hasAnyWordsOf(inParser->generatedStrings(), inParser->generatedCount());
391 }
392 
393 
396 {
397  inParser->generateSearchArray();
398  return hasAllWordsOf(inParser->generatedStrings(), inParser->generatedCount());
399 }
400 
401 
403 dbTable::hasAnyWordsStartsWithDelimited(const char* str, char delimiter)
404 {
406 }
407 
408 
410 dbTable::hasAllWordsStartsWithDelimited(const char* str, char delimiter)
411 {
413 }
414 
415 
417 dbTable::hasAnyWordsStartsWithOf(const char **schStrs, unsigned long count)
418 {
420 }
421 
422 
424 dbTable::hasAllWordsStartsWithOf(const char **schStrs, unsigned long count)
425 {
427 }
428 
429 
432 {
433  inParser->generateSearchArray();
434  return hasAllWordsStartsWithOf(inParser->generatedStrings(), inParser->generatedCount());
435 }
436 
437 
440 {
441  inParser->generateSearchArray();
442  return hasAnyWordsStartsWithOf(inParser->generatedStrings(), inParser->generatedCount());
443 }
444 
445 
446 // -------------------------------------------------------
447 // O O F _ m i x K e y w o r d a b l e
448 // -------------------------------------------------------
450  mParser(0),
451  mThisField(thisField)
452 {
453 }
454 
455 
457  mParser(rhs.mParser),
458  mThisField(rhsAsField)
459 {
460  if (mParser)
461  mParser->incRefs();
462 }
463 
464 
465 
467 {
468  if (mParser)
469  mParser->decRefs();
470 }
471 
472 
474 OOF_mixKeywordableField::hasWord(const char* str) const
475 {
477 }
478 
479 
482 {
484 }
485 
486 
488 OOF_mixKeywordableField::hasAnyWordsDelimited(const char* str, char delimiter)
489 {
491 }
492 
493 
495 OOF_mixKeywordableField::hasAllWordsDelimited(const char* str, char delimiter)
496 {
498 }
499 
500 
502 OOF_mixKeywordableField::hasAnyWordsOf(const char **schStrs, unsigned long count)
503 {
505 }
506 
507 
509 OOF_mixKeywordableField::hasAllWordsOf(const char **schStrs, unsigned long count)
510 {
512 }
513 
514 
517 {
518  inParser->generateSearchArray();
519  return hasAnyWordsOf(inParser->generatedStrings(), inParser->generatedCount());
520 }
521 
522 
525 {
526  inParser->generateSearchArray();
527  return hasAllWordsOf(inParser->generatedStrings(), inParser->generatedCount());
528 }
529 
530 
533 {
535 }
536 
537 
540 {
542 }
543 
544 
546 OOF_mixKeywordableField::hasAnyWordsStartsWithOf(const char **schStrs, unsigned long count)
547 {
549 }
550 
551 
553 OOF_mixKeywordableField::hasAllWordsStartsWithOf(const char **schStrs, unsigned long count)
554 {
556 }
557 
558 
561 {
562  inParser->generateSearchArray();
563  return hasAnyWordsStartsWithOf(inParser->generatedStrings(), inParser->generatedCount());
564 }
565 
566 
569 {
570  inParser->generateSearchArray();
571  return hasAllWordsStartsWithOf(inParser->generatedStrings(), inParser->generatedCount());
572 }
573 
574 
575 void
577 {
578 #ifdef OOF_DEBUG
580  dbConnect::raise( "Attempt to specify that standalone field has keyword index", false);
581  else
582 #endif
583  {
584  if (adoptedParser)
585  mParser = adoptedParser;
586  else {
588  mParser->incRefs();
589  }
590  }
591 }
592 
593 
596 {
597  if (mParser)
598  return mParser;
599  else
601 }
dbQueryBinaryNofield hasAnyWordsStartsWithOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:417
dbQueryBinaryNofield hasAllWordsStartsWithDelimited(const char *, char delimiter)
Definition: oofwords.cpp:410
char mWord[maxWordLength]
Definition: oofwords.h:65
static void raise(std::ostream &, bool terminateAfterMsg=true)
virtual const char * word() const
returns the current word.
Definition: oofwords.cpp:228
precompilation header.
unsigned long mIterStrings
Definition: oofwords.h:72
unsigned long generatedCount() const
Definition: oofwords.h:87
Tries to hide the different platforms and version issues with standard IO.
String fragment arguments passed in as array of C strings.
Definition: oofquery.h:403
Base class for user-replaceable word parser.
Definition: oofwords.h:40
dbQueryBinaryNofield hasAnyWordsStartsWithDelimited(const char *, char delimiter)
Definition: oofwords.cpp:403
String fragment arguments passed in as single string with a delimiter character.
Definition: oofquery.h:369
dbQueryBinaryNofield hasWordStartsWith(const char *str) const
Definition: oofwords.cpp:352
bool fieldIsStandalone() const
Definition: oof3.h:733
dbQueryBinaryNofield hasAllWordsOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:380
unsigned long count() const
Definition: oofarray.h:126
oofWordParser * getDefaultWordParser()
Definition: oof1.cpp:1132
int OOF_stricmp(const char *, const char *)
Compare strings ignoring case.
Definition: oofstr.cpp:59
dbQueryBinaryNofield hasAnyWordsDelimited(const char *, char delimiter)
Definition: oofwords.cpp:359
oofWordParser * wordParser() const
Definition: oofwords.cpp:595
Mixin class for fields that are keyword-indexed.
Definition: oof3.h:186
OOF_mixKeywordableField(dbField *)
Definition: oofwords.cpp:449
virtual void start()
sets the oofWordParser to the start of the string.
Definition: oofwords.cpp:196
const char * mpStringStart
pointer to original strings
Definition: oofwords.h:64
Common binary query for field, eg: People.Salary > 90000.
Definition: oofquery.h:165
unsigned long mGeneratedCount
Definition: oofwords.h:71
bool contains(const char *) const
Definition: oofwords.cpp:318
const char * mpString
pointer to start of words to process, may point to mGeneratedStrings entries
Definition: oofwords.h:63
dbQueryBinary hasWord(const char *str) const
Definition: oofwords.cpp:474
dbQueryBinary hasAnyWordsStartsWithOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:546
dbQueryBinary hasAllWordsOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:509
LHS argument to queries on fields.
Definition: oofquery.h:572
virtual ~OOF_mixKeywordableField()
Definition: oofwords.cpp:466
unsigned int mReferences
Definition: oof1.h:90
oofWordParser * mParser
Definition: oof3.h:214
virtual bool isValidWord(const char *) const
Definition: oofwords.cpp:169
virtual unsigned short & minWordLength()
returns/sets the minimum word length cut-off.
Definition: oofwords.cpp:250
dbQueryBinaryNofield hasAllWordsStartsWithOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:424
Declare query classes.
dbTable * fieldTable() const
Definition: oof3.cpp:308
virtual void generateSearchArray()
Definition: oofwords.cpp:269
virtual ~oofWordParser()
dtor.
Definition: oofwords.cpp:71
virtual const char * wordString() const
returns the original string.
Definition: oofwords.cpp:239
dbQueryBinary hasWordStartsWith(const char *str) const
Definition: oofwords.cpp:481
void append(unsigned long)
Definition: oofarray.cpp:131
oofWordParser(const char *stringToParse=0, unsigned short minWordLen=3, const char **stopWords=0, unsigned short numStopWords=0)
Definition: oofwords.cpp:50
dbQueryBinary hasAllWordsStartsWithOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:553
const char ** mGeneratedStrings
owned strings generated for subsearch call
Definition: oofwords.h:70
dbQueryBinary hasAnyWordsOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:502
RHS String argument to queries on fields like dbChar.
Definition: oofquery.h:324
dbQueryBinary hasAllWordsStartsWithDelimited(const char *, char delimiter)
Definition: oofwords.cpp:539
dbQueryBinary hasAnyWordsStartsWithDelimited(const char *, char delimiter)
Definition: oofwords.cpp:532
dbQueryBinary hasAnyWordsDelimited(const char *, char delimiter)
Definition: oofwords.cpp:488
dbQueryBinaryNofield hasAnyWordsOf(const char **schStrs, unsigned long count)
Definition: oofwords.cpp:373
const char ** generatedStrings() const
return array of pointers to individual words.
Definition: oofwords.h:80
virtual bool more() const
return true if any words left.
Definition: oofwords.cpp:217
Contain a list of words parsed from an input string or field being word-indexed.
Definition: oofwords.h:107
unsigned short mMinWordLength
Definition: oofwords.h:67
dbQueryBinaryNofield hasWord(const char *str) const
Definition: oofwords.cpp:345
void decRefs()
Definition: oof1.cpp:3212
dbField * mThisField
Definition: oof3.h:215
void incRefs()
Definition: oof1.h:2677
dbQueryBinaryNofield hasAllWordsDelimited(const char *, char delimiter)
Definition: oofwords.cpp:366
virtual void next(void)
Find the next word, updating the mWord buffer returned by word().
Definition: oofwords.cpp:135
Binary query where LHS is table, eg: dbTable::hasAllWordsDelimited.
Definition: oofquery.h:130
void append(const char *)
Definition: oofwords.cpp:331
void extract(std::ostream &)
Definition: oofwords.cpp:257
Base class for persistent fields in dbTable's.
Definition: oof3.h:63
unsigned long value(unsigned long index) const
Definition: oofarray.cpp:243
void indexWords(oofWordParser *adoptedParser=0)
Definition: oofwords.cpp:576
unsigned short mWordLength
Definition: oofwords.h:66
dbQueryBinary hasAllWordsDelimited(const char *, char delimiter)
Definition: oofwords.cpp:495
const char ** orphanStringArray(unsigned long &outCount)
Definition: oofwords.cpp:304