| Trees | Indices | Help |
|
|---|
|
|
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright 2006 Zuza Software Foundation
4 #
5 # This file is part of translate.
6 #
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
11 #
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
21 """A class that does terminology matching"""
22
23 import re
24
25 # We don't want to miss certain forms of words that only change a little
26 # at the end. Now we are tying this code to English, but it should serve
27 # us well. For example "category" should be found in "categories",
28 # "copy" should be found in "copied"
29 #
30 # The tuples define a regular expression to search for, and what
31 # it should be replaced with.
# Each entry is a (regular expression, replacement) pair that rewrites a term
# into a close variant of itself. The patterns are raw strings so that regex
# escapes such as \s are not read as (invalid) string escape sequences.
ignorepatterns = [
    (r"y\s*$", "ie"),  # category/categories, identify/identifies, apply/applied
    (r"[\s-]*", ""),   # down time / downtime, pre-order / preorder
    ("-", " "),        # pre-order / pre order
    (" ", "-"),        # pre order / pre-order
]
37
38 #TODO: compile regexes
39
43
45 """returns the match quality of term b in the text a"""
46 # We could segment the words, but mostly it will give less ideal
47 # results, since we'll miss plurals, etc. Then we also can't search for
48 # multiword terms, such as "Free Software". Ideally we should use a
49 # stemmer, like the Porter stemmer.
50
51 # So we just see if the word occurs anywhere. This is not perfect since
52 # we might get more than we bargained for. The term "form" will be found
53 # in the word "format", for example. A word like "at" will trigger too
54 # many false positives.
55
56 # First remove a possible disambiguating bracket at the end
57 b = re.sub("\s+\(.*\)\s*$", "", b)
58
59 if len(b) <= 2:
60 return 0
61
62 pos = a[:self.MAX_LEN].find(b)
63 if pos >= 0:
64 return 100 - pos * 10 / len(a[:self.MAX_LEN])
65
66 for ignorepattern in ignorepatterns:
67 newb = re.sub(ignorepattern[0], ignorepattern[1], b)
68 if newb in a[:self.MAX_LEN]:
69 return 80
70 return 0
71
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Wed Mar 26 12:49:37 2008 | http://epydoc.sourceforge.net |