PLearn: TypesNumeriques.cc Source File

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999,2000 Pascal Vincent, Yoshua Bengio and University of Montreal
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library.
For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: TypesNumeriques.cc,v 1.2 2004/07/12 13:39:30 tihocan Exp $ 00037 * AUTHORS: Steven Pigeon & Yoshua Bengio 00038 * This file is part of the PLearn library. 00039 ******************************************************* */ 00040 00041 #include <cstdlib> 00042 #include <cstring> 00043 #include "TypesNumeriques.h" 00044 00045 namespace PLearn { 00046 using namespace std; 00047 00048 00050 const char DIGITsymbols[] = "0123456789"; 00051 const char ALPHAsymbols[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; 00052 const char *ORDINALS[] = {"d","nd","th","st",0}; // 3d, 2nd, 12th, 1st 00053 00055 const char *eNumericTypeNames(int a) 00056 { 00057 static char retour[128]; 00058 00059 // all that applies 00060 if (a==NT_NOT_NUMERIC) 00061 return "not numeric"; 00062 else { 00063 retour[0]=0; 00064 if (a & NT_ORDINAL) strcat(retour,"ordinal "); 00065 if (a & NT_CARDINAL) strcat(retour,"cardinal "); 00066 if (a & NT_CURRENCY) strcat(retour,"currency "); 00067 if (a & NT_PREFIXED) strcat(retour,"prefixe "); 00068 if (a & NT_SUFFIXED) strcat(retour,"suffixe "); 00069 if (a & NT_PERCENT) strcat(retour,"pourcentage "); 00070 if (a & NT_RANGE) strcat(retour,"range "); 00071 if (a & NT_TIME) strcat(retour,"temps "); 00072 if (a & NT_CODE) strcat(retour,"code "); 00073 if (a & NT_UNKNOWN_NUMERIC_TYPE) strcat(retour," ??? 
"); 00074 return retour; 00075 } 00076 } 00077 00079 bool containsChar(const char *s, const char *symbols) 00080 { 00081 bool found = false; 00082 int i=0; 00083 while (!found && symbols[i]) 00084 { 00085 found = (bool)strchr(s,symbols[i]); 00086 i++; 00087 } 00088 return found; 00089 } 00090 00091 00093 char * stringPos(const char *s, const char *strings[]) 00094 { 00095 char *t = 0; 00096 int i=0; 00097 while (!t && strings[i]) 00098 { 00099 t = strstr(s,strings[i]); 00100 i++; 00101 } 00102 return t; 00103 } 00104 00105 00107 bool looksNumeric(const char *s) 00108 { 00109 return containsChar(s,DIGITsymbols); 00110 } 00111 00113 bool elementOf(const char *s, const char t) 00114 { 00115 return (bool)strchr(s,t); 00116 } 00117 00119 void compactRepresentationTranslate(char *t) 00120 { 00121 int d=0; 00122 int s=0; 00123 00124 while (t[s]) 00125 { 00126 if (elementOf(DIGITsymbols,t[s])) 00127 { 00128 t[d++]='n'; 00129 // skip to the next non-digit 00130 do { s++; } while (t[s] && (elementOf(DIGITsymbols,t[s]) || (t[s]==',')) ); 00131 } 00132 else if (elementOf(ALPHAsymbols,t[s])) 00133 { 00134 if ( (stringPos(&t[s],ORDINALS)==&t[s]) // starts here 00135 && (t[d-1]=='n') ) // and the previous run was composed of digits 00136 t[d++]='o'; 00137 else t[d++]='a'; 00138 // skip to the next non-alpha 00139 do { s++; } while (t[s] && elementOf(ALPHAsymbols,t[s])); 00140 } 00141 else t[d++]=t[s++]; 00142 } 00143 t[d]=0; 00144 00145 } 00146 00148 void compactRepresentationShrinkNum(char *t) 00149 { 00150 // remplace n.n ou .n par n, 00151 // mais laisse les constructions du genre n.n.n intactes 00152 int d=0; 00153 int s=0; 00154 00155 while (t[s]) 00156 { 00157 if ( (strstr(&t[s],"n.n") == &t[s]) && 00158 (t[s+3]!='.') && 00159 ( (s-1<0) || (t[s-1]!='.') ) 00160 ) 00161 { 00162 t[d++]='n'; 00163 s+=3; 00164 } 00165 else if ( (strstr(&t[s],".n") == &t[s]) && 00166 (t[s+2]!='.') && 00167 ( (s-1<0) || t[s-1]!='n')) 00168 { 00169 t[d++]='n'; 00170 s+=2; 00171 } 00172 else 
t[d++]=t[s++]; 00173 } 00174 t[d]=0; 00175 } 00176 00178 void compactRepresentationRangesAndOrdinals(char *t) 00179 { 00180 // remplace n-n par r et no par o 00181 int d=0; 00182 int s=0; 00183 00184 while (t[s]) 00185 { 00186 if ( strstr(&t[s],"n-n") == &t[s]) 00187 { 00188 t[d++]='r'; 00189 s+=3; 00190 } 00191 else if ( strstr(&t[s],"no") == &t[s]) 00192 { 00193 t[d++]='o'; 00194 s+=2; 00195 } 00196 else t[d++]=t[s++]; 00197 } 00198 t[d]=0; 00199 } 00200 00202 void compactRepresentation(char *t) 00203 { 00204 compactRepresentationTranslate(t); // remplace les lettres et chiffres par des codes. 00205 compactRepresentationShrinkNum(t); // replace n.n par n, etc. 00206 compactRepresentationRangesAndOrdinals(t); // remplace n-n par r et no par o 00207 00208 int s=0; 00209 int d=0; 00210 00211 // strip les tirets - 00212 while (t[s]) 00213 if (t[s]!='-') 00214 t[d++]=t[s++]; 00215 else s++; 00216 00217 t[d]=0; 00218 00219 // copie une seule instance du meme symbole. 00220 s=0; 00221 d=0; 00222 while (t[s]) 00223 { 00224 t[d++]=t[s++]; 00225 while (t[s] && (t[s]==t[d-1])) s++; 00226 } 00227 00228 if (t[d-1]=='.') d--; // trailing . 00229 t[d]=0; 00230 00231 char c = '#'; 00232 d=0; 00233 do 00234 { 00235 char tt = t[d]; 00236 t[d]=c; 00237 c=tt; 00238 d++; 00239 } while (c); 00240 t[d]=0; 00241 } 00242 00244 int numericType(const char *mot) 00245 { 00246 if (looksNumeric(mot)) 00247 { 00248 int classe=0; 00249 char t[128]; 00250 bool pourcent=false; 00251 strcpy(t,mot); 00252 00253 compactRepresentation(t); 00254 00255 // skips the # in the begining 00256 00257 if (char *tt= strchr(t,'%')) 00258 *tt=0, pourcent = true; // delete trailing % 00259 00260 for (int i=0; (rules[i].pattern[0]) && (!classe); i++) 00261 if (strcmp(rules[i].pattern,t)==0) classe = rules[i].attributs; 00262 00263 if (pourcent) classe += NT_PERCENT; 00264 00265 return classe ? 
classe : NT_UNKNOWN_NUMERIC_TYPE; 00266 } 00267 else return NT_NOT_NUMERIC; 00268 } 00269 00270 00271 00272 } // end of namespace PLearn