Hier soir, j'ai testé rapidement l'installation de tsearch pour le stemmer en français ! Il m'est apparu que, sur le site de Snowball, les fichiers ont été mis à jour récemment, au cours du mois de février 2007. Or, moi, je les avais testés en début de février ! Ce qui m'a empêché de faire mes tests.

Cependant, cela marche pour les fichiers stem.c et stem.h d'avant. Les deux fichiers en question sont donc disponibles à http://www.davidgis.fr/download/stem.h et http://www.davidgis.fr/download/stem.c

Donc rapidement ici, je marque mes notes pour me rappeler comment j'ai fait sous UBUNTU DAPPER. Petit rappel : mes locales sont en UTF-8, donc cela ne marchera que pour des bases encodées en UTF-8 !!!

postgres@bremko:~/tmp$ echo $LANG
fr_FR.UTF-8

INSTALLATION RAPIDE DE TOUT LE TOUTIM: SERVEUR+TSEARCH2+STEMMER+ISPELL

So Let's go

#
# Install PostgreSQL 8.2.3 from source under /opt/pgsql
#
# Put the future binaries on the PATH and point PGDATA at the cluster directory.
export PATH=/opt/pgsql/bin:$PATH
export PGDATA=/opt/pgsql/data
wget ftp://ftp.fr.postgresql.org//source/v8.2.3/postgresql-8.2.3.tar.bz2
tar xvjf postgresql-8.2.3.tar.bz2
cd postgresql-8.2.3
./configure --prefix=/opt/pgsql/ --enable-nls=fr
# Install only if the build succeeded.
make && make install
# -p makes the step re-runnable when the data directory already exists;
# quoting protects against an unexpected value in PGDATA.
mkdir -p "$PGDATA"
chown postgres:postgres "$PGDATA"
#
# Install tsearch2 and the French stemmer
#
# Download the tsearch/Snowball patch and apply it in the current directory
# (the postgresql-8.2.3 source tree entered earlier in these notes).
wget http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/tsearch_snowball_82.gz
gunzip tsearch_snowball_82.gz
patch -p0 < tsearch_snowball_82
# Build and install the tsearch2 contrib module.
cd contrib/tsearch2/
make install
# Fetch the older stem.c / stem.h (the ones known to work) into gendict.
cd gendict
wget http://www.davidgis.fr/download/stem.c
wget http://www.davidgis.fr/download/stem.h
# Generate the dictionary named "fr" from those two files.
# NOTE(review): flag meanings are not visible here - check gendict's README.
./config.sh -n fr -s -p french_UTF_8 -i -v -c stem.c -h stem.h -C'Snowball stemmer for French'
# NOTE(review): presumably config.sh created contrib/dict_fr - confirm.
cd ../../dict_fr && make install
#
# Initialise the cluster and start the server
#
# Switch to the postgres user; the environment is exported again for that shell.
su postgres
export PATH=/opt/pgsql/bin:$PATH
export PGDATA=/opt/pgsql/data
# "trust" authentication: no password is required for connections (test setup).
# NOTE(review): initdb is expected to take the cluster locale from the
# environment (fr_FR.UTF-8 in these notes) - confirm before reusing elsewhere.
initdb -A trust
# -w waits until the server is actually up, instead of a fixed "sleep 3"
# that can be too short on a slow machine.
pg_ctl -w start
#
# Create a database and load the tsearch2 and French stemmer objects into it
#
createdb test
psql -d test -f /opt/pgsql/share/contrib/tsearch2.sql
psql -d test -f /opt/pgsql/share/contrib/dict_fr.sql
# Work from ~/tmp; -p keeps the step re-runnable when the directory already
# exists, and && avoids unpacking into the home directory if cd fails.
cd
mkdir -p tmp && cd tmp
# French ispell/stop-word files used by the dictionary configuration step.
wget http://www.davidgis.fr/download/tsearch2_french_files.zip
unzip tsearch2_french_files.zip
#
# Register the French ispell dictionary and the default_french configuration
#
# A here-document replaces echo "...": inside a double-quoted echo argument the
# shell consumes the inner double quotes around the DictFile / AffFile /
# StopFile paths, so they never reach psql and the paths are expanded unquoted.
# The unquoted EOF delimiter still expands PWD, so run this from the directory
# that holds french.dict, french.aff and french.stop.
psql test <<EOF
BEGIN TRANSACTION;

-- Stop-word file for the Snowball stemmer dictionary "fr".
UPDATE pg_ts_dict
SET dict_initoption = '$PWD/french.stop'
WHERE dict_name = 'fr';

-- New text search configuration bound to the fr_FR.UTF-8 locale.
INSERT INTO pg_ts_cfg (ts_name, prs_name, locale)
VALUES ('default_french', 'default', 'fr_FR.UTF-8');

-- Token type to dictionary list mapping for default_french.
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'email', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'file', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'float', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'host', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'hword', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'int', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'lhword', '{fr_ispell}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'lpart_hword', '{fr_ispell}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'nlhword', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'nlpart_hword', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'part_hword', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'sfloat', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'uint', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'uri', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'url', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'version', '{simple}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'word', '{fr_ispell,fr}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'nlword', '{fr_ispell}');
INSERT INTO pg_ts_cfgmap (ts_name, tok_alias, dict_name) VALUES ('default_french', 'lword', '{fr_ispell,fr,simple}');

-- Create fr_ispell by copying the init/lexize functions of ispell_template.
-- The three adjacent string literals are joined into one option string.
INSERT INTO pg_ts_dict (dict_name, dict_init, dict_initoption, dict_lexize)
SELECT
    'fr_ispell',
    dict_init,
    'DictFile="$PWD/french.dict",'
    'AffFile="$PWD/french.aff",'
    'StopFile="$PWD/french.stop"',
    dict_lexize
FROM pg_ts_dict
WHERE dict_name = 'ispell_template';

COMMIT;
EOF

Quelques vérifications rapides

postgres@bremko:~/tmp$ psql -d test  -c "SELECT lexize('fr','anticonstitutionnellement');"
        lexize
-----------------------
 {anticonstitutionnel}
(1 ligne)

postgres@bremko:~/tmp$ psql -d test  -c "select ts_debug('Je m appelle David ,j'' habite à Montpellier et mon e-mail est davidtecher@yahoo.fr');"
                                        ts_debug
-----------------------------------------------------------------------------------------
 (default_french,lword,"Latin word",Je,"{fr_ispell,fr,simple}","")
 (default_french,lword,"Latin word",m,"{fr_ispell,fr,simple}","")
 (default_french,lword,"Latin word",appelle,"{fr_ispell,fr,simple}","'peler' 'appelle'")
 (default_french,lword,"Latin word",David,"{fr_ispell,fr,simple}",'david')
 (default_french,lword,"Latin word",j,"{fr_ispell,fr,simple}","")
 (default_french,lword,"Latin word",habite,"{fr_ispell,fr,simple}",'habite')
 (default_french,nlword,"Non-latin word",à,{fr_ispell},"")
 (default_french,lword,"Latin word",Montpellier,"{fr_ispell,fr,simple}",'montpellier')
 (default_french,lword,"Latin word",et,"{fr_ispell,fr,simple}","")
 (default_french,lword,"Latin word",mon,"{fr_ispell,fr,simple}","")
 (default_french,lhword,"Latin hyphenated word",e-mail,{fr_ispell},'mail')
 (default_french,lpart_hword,"Latin part of hyphenated word",e,{fr_ispell},"")
 (default_french,lpart_hword,"Latin part of hyphenated word",mail,{fr_ispell},'mail')
 (default_french,lword,"Latin word",est,"{fr_ispell,fr,simple}","")
 (default_french,email,Email,davidtecher@yahoo.fr,{simple},'davidtecher@yahoo.fr')
(15 lignes)