mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Merge branch 'place_extraction'
Conflicts: analyze.py
This commit is contained in:
commit
41f2afcc12
21 changed files with 54650 additions and 0 deletions
78
analyze.py
78
analyze.py
|
|
@ -1,4 +1,82 @@
|
|||
<<<<<<< HEAD
|
||||
import requests, json
|
||||
=======
|
||||
import re
|
||||
import string
|
||||
from nltk.tag.stanford import POSTagger
|
||||
|
||||
# Module-wide part-of-speech tagger: built once from the bundled
# "german-fast" model and the Stanford POS tagger 3.5.0 jar, with UTF-8 as the
# encoding argument. Both paths are relative ("./..."), so they presumably
# resolve against the current working directory -- TODO confirm how the script
# is launched.
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
|
||||
'./stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
|
||||
'UTF-8')
|
||||
|
||||
# Character class matching any single character of string.punctuation (ASCII
# punctuation only); get_potential_places substitutes a space for each match
# before splitting the text into words.
punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))
|
||||
|
||||
def get_potential_places(article_place, article_body):
    """
    Returns a list of potential places, each one a list of tagger tokens,
    for later filtering (see improve_potential_places).

    Both arguments are strings. Punctuation is replaced by spaces, the text is
    split on whitespace and handed to the module-level ``tagger``. Each token
    is indexed as ``token[1]`` for its part-of-speech tag, so tokens are
    presumably (word, tag) pairs -- confirm against the NLTK version in use.

    The first entry of the result is always the tagged ``article_place``.
    Every following entry is one candidate phrase from ``article_body``: the
    run of article / adjective / noun / number tokens that directly follows a
    preposition, e.g. "Im S-Bahnhof Wedding". The preposition itself is not
    part of the phrase. A preposition that is not followed by any such token
    yields an empty list; callers are expected to filter those out.
    """
    place_pos = tagger.tag(punctuation_regex.sub(" ", article_place).split())
    text_pos = tagger.tag(punctuation_regex.sub(" ", article_body).split())

    # Tag names below follow the STTS tagset used by the German models:
    # APPR = preposition, APPRART = preposition fused with an article ("im")
    preposition_tags = ("APPR", "APPRART")
    # ART = article, ADJA = attributive adjective, NN = common noun,
    # NE = proper noun, CARD = cardinal number
    phrase_tags = ("ART", "ADJA", "NN", "NE", "CARD")

    # extract the places out of the full text
    places = [place_pos]
    is_matching = False
    current_match = []
    for token in text_pos:
        tag = token[1]

        if not is_matching:
            # start matching when we have a preposition
            is_matching = tag in preposition_tags
            continue

        if tag in phrase_tags:
            current_match.append(token)
            continue

        # any other tag ends the phrase, so store what we collected
        places.append(current_match)
        current_match = []

        # when the token ending the phrase is itself a preposition, it
        # immediately starts the next match
        is_matching = tag in preposition_tags

    # a phrase that runs up to the very end of the text was never closed by a
    # following token; without this it would be silently dropped
    if current_match:
        places.append(current_match)

    return places
|
||||
|
||||
def improve_potential_places(pos_tuples):
    """
    Improves the matches' quality so we don't have to look up the lat-lng of so
    many mismatches.

    ``pos_tuples`` is a list of candidate phrases, each a list of tokens whose
    second element (``token[1]``) is the part-of-speech tag, as produced by
    get_potential_places. Returns a new list of cleaned phrases; the input is
    not modified.

    Filtering rules:
    - empty phrases are dropped
    - articles (tag "ART") are removed from a phrase
    - a cardinal number (tag "CARD") is only accepted as the last token of a
      phrase and only after at least one kept word (a house number such as
      "Kruegerstr. 22"); a number anywhere else discards the whole phrase
    - phrases that end up empty are dropped
    """
    better_tuples = []
    for tuple_list in pos_tuples:
        # first, exclude empty lists
        if not tuple_list:
            continue

        last_index = len(tuple_list) - 1
        cleaner_list = []
        for index, token in enumerate(tuple_list):
            tag = token[1]

            # exclude articles ("the", "a"), they only introduce noise, but
            # keep the list as a whole
            if tag == "ART":
                continue

            # if we have numbers in the middle of our phrase, probably the
            # whole list is not useful (as opposed to e.g. Kruegerstr. 22).
            # A number with no word before it is not a house number either.
            if tag == "CARD" and (index < last_index or not cleaner_list):
                cleaner_list = []
                break

            cleaner_list.append(token)

        if cleaner_list:
            better_tuples.append(cleaner_list)

    return better_tuples
|
||||
>>>>>>> place_extraction
|
||||
|
||||
def get_district(article_headline):
|
||||
"""
|
||||
|
|
|
|||
339
stanford-postagger-full-2014-10-26/LICENSE.txt
Normal file
339
stanford-postagger-full-2014-10-26/LICENSE.txt
Normal file
|
|
@ -0,0 +1,339 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 2, June 1991
|
||||
|
||||
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
License is intended to guarantee your freedom to share and change free
|
||||
software--to make sure the software is free for all its users. This
|
||||
General Public License applies to most of the Free Software
|
||||
Foundation's software and to any other program whose authors commit to
|
||||
using it. (Some other Free Software Foundation software is covered by
|
||||
the GNU Lesser General Public License instead.) You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
this service if you wish), that you receive source code or can get it
|
||||
if you want it, that you can change the software or use pieces of it
|
||||
in new free programs; and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
anyone to deny you these rights or to ask you to surrender the rights.
|
||||
These restrictions translate to certain responsibilities for you if you
|
||||
distribute copies of the software, or if you modify it.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must give the recipients all the rights that
|
||||
you have. You must make sure that they, too, receive or can get the
|
||||
source code. And you must show them these terms so they know their
|
||||
rights.
|
||||
|
||||
We protect your rights with two steps: (1) copyright the software, and
|
||||
(2) offer you this license which gives you legal permission to copy,
|
||||
distribute and/or modify the software.
|
||||
|
||||
Also, for each author's protection and ours, we want to make certain
|
||||
that everyone understands that there is no warranty for this free
|
||||
software. If the software is modified by someone else and passed on, we
|
||||
want its recipients to know that what they have is not the original, so
|
||||
that any problems introduced by others will not reflect on the original
|
||||
authors' reputations.
|
||||
|
||||
Finally, any free program is threatened constantly by software
|
||||
patents. We wish to avoid the danger that redistributors of a free
|
||||
program will individually obtain patent licenses, in effect making the
|
||||
program proprietary. To prevent this, we have made it clear that any
|
||||
patent must be licensed for everyone's free use or not licensed at all.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License applies to any program or other work which contains
|
||||
a notice placed by the copyright holder saying it may be distributed
|
||||
under the terms of this General Public License. The "Program", below,
|
||||
refers to any such program or work, and a "work based on the Program"
|
||||
means either the Program or any derivative work under copyright law:
|
||||
that is to say, a work containing the Program or a portion of it,
|
||||
either verbatim or with modifications and/or translated into another
|
||||
language. (Hereinafter, translation is included without limitation in
|
||||
the term "modification".) Each licensee is addressed as "you".
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running the Program is not restricted, and the output from the Program
|
||||
is covered only if its contents constitute a work based on the
|
||||
Program (independent of having been made by running the Program).
|
||||
Whether that is true depends on what the Program does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Program's
|
||||
source code as you receive it, in any medium, provided that you
|
||||
conspicuously and appropriately publish on each copy an appropriate
|
||||
copyright notice and disclaimer of warranty; keep intact all the
|
||||
notices that refer to this License and to the absence of any warranty;
|
||||
and give any other recipients of the Program a copy of this License
|
||||
along with the Program.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy, and
|
||||
you may at your option offer warranty protection in exchange for a fee.
|
||||
|
||||
2. You may modify your copy or copies of the Program or any portion
|
||||
of it, thus forming a work based on the Program, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) You must cause the modified files to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
b) You must cause any work that you distribute or publish, that in
|
||||
whole or in part contains or is derived from the Program or any
|
||||
part thereof, to be licensed as a whole at no charge to all third
|
||||
parties under the terms of this License.
|
||||
|
||||
c) If the modified program normally reads commands interactively
|
||||
when run, you must cause it, when started running for such
|
||||
interactive use in the most ordinary way, to print or display an
|
||||
announcement including an appropriate copyright notice and a
|
||||
notice that there is no warranty (or else, saying that you provide
|
||||
a warranty) and that users may redistribute the program under
|
||||
these conditions, and telling the user how to view a copy of this
|
||||
License. (Exception: if the Program itself is interactive but
|
||||
does not normally print such an announcement, your work based on
|
||||
the Program is not required to print an announcement.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Program,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Program, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Program.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Program
|
||||
with the Program (or with a work based on the Program) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may copy and distribute the Program (or a work based on it,
|
||||
under Section 2) in object code or executable form under the terms of
|
||||
Sections 1 and 2 above provided that you also do one of the following:
|
||||
|
||||
a) Accompany it with the complete corresponding machine-readable
|
||||
source code, which must be distributed under the terms of Sections
|
||||
1 and 2 above on a medium customarily used for software interchange; or,
|
||||
|
||||
b) Accompany it with a written offer, valid for at least three
|
||||
years, to give any third party, for a charge no more than your
|
||||
cost of physically performing source distribution, a complete
|
||||
machine-readable copy of the corresponding source code, to be
|
||||
distributed under the terms of Sections 1 and 2 above on a medium
|
||||
customarily used for software interchange; or,
|
||||
|
||||
c) Accompany it with the information you received as to the offer
|
||||
to distribute corresponding source code. (This alternative is
|
||||
allowed only for noncommercial distribution and only if you
|
||||
received the program in object code or executable form with such
|
||||
an offer, in accord with Subsection b above.)
|
||||
|
||||
The source code for a work means the preferred form of the work for
|
||||
making modifications to it. For an executable work, complete source
|
||||
code means all the source code for all modules it contains, plus any
|
||||
associated interface definition files, plus the scripts used to
|
||||
control compilation and installation of the executable. However, as a
|
||||
special exception, the source code distributed need not include
|
||||
anything that is normally distributed (in either source or binary
|
||||
form) with the major components (compiler, kernel, and so on) of the
|
||||
operating system on which the executable runs, unless that component
|
||||
itself accompanies the executable.
|
||||
|
||||
If distribution of executable or object code is made by offering
|
||||
access to copy from a designated place, then offering equivalent
|
||||
access to copy the source code from the same place counts as
|
||||
distribution of the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
4. You may not copy, modify, sublicense, or distribute the Program
|
||||
except as expressly provided under this License. Any attempt
|
||||
otherwise to copy, modify, sublicense or distribute the Program is
|
||||
void, and will automatically terminate your rights under this License.
|
||||
However, parties who have received copies, or rights, from you under
|
||||
this License will not have their licenses terminated so long as such
|
||||
parties remain in full compliance.
|
||||
|
||||
5. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Program or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Program (or any work based on the
|
||||
Program), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Program or works based on it.
|
||||
|
||||
6. Each time you redistribute the Program (or any work based on the
|
||||
Program), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute or modify the Program subject to
|
||||
these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties to
|
||||
this License.
|
||||
|
||||
7. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Program at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Program by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Program.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under
|
||||
any particular circumstance, the balance of the section is intended to
|
||||
apply and the section as a whole is intended to apply in other
|
||||
circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system, which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
8. If the distribution and/or use of the Program is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Program under this License
|
||||
may add an explicit geographical distribution limitation excluding
|
||||
those countries, so that distribution is permitted only in or among
|
||||
countries not thus excluded. In such case, this License incorporates
|
||||
the limitation as if written in the body of this License.
|
||||
|
||||
9. The Free Software Foundation may publish revised and/or new versions
|
||||
of the General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Program
|
||||
specifies a version number of this License which applies to it and "any
|
||||
later version", you have the option of following the terms and conditions
|
||||
either of that version or of any later version published by the Free
|
||||
Software Foundation. If the Program does not specify a version number of
|
||||
this License, you may choose any version ever published by the Free Software
|
||||
Foundation.
|
||||
|
||||
10. If you wish to incorporate parts of the Program into other free
|
||||
programs whose distribution conditions are different, write to the author
|
||||
to ask for permission. For software which is copyrighted by the Free
|
||||
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||
make exceptions for this. Our decision will be guided by the two goals
|
||||
of preserving the free status of all derivatives of our free software and
|
||||
of promoting the sharing and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||
REPAIR OR CORRECTION.
|
||||
|
||||
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program is interactive, make it output a short notice like this
|
||||
when it starts in an interactive mode:
|
||||
|
||||
Gnomovision version 69, Copyright (C) year name of author
|
||||
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, the commands you use may
|
||||
be called something other than `show w' and `show c'; they could even be
|
||||
mouse-clicks or menu items--whatever suits your program.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the program, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
||||
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1989
|
||||
Ty Coon, President of Vice
|
||||
|
||||
This General Public License does not permit incorporating your program into
|
||||
proprietary programs. If your program is a subroutine library, you may
|
||||
consider it more useful to permit linking proprietary applications with the
|
||||
library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License.
|
||||
295
stanford-postagger-full-2014-10-26/README.txt
Normal file
295
stanford-postagger-full-2014-10-26/README.txt
Normal file
|
|
@ -0,0 +1,295 @@
|
|||
Stanford POS Tagger, v3.5.0 - 2014-10-26
|
||||
Copyright (c) 2002-2012 The Board of Trustees of
|
||||
The Leland Stanford Junior University. All Rights Reserved.
|
||||
|
||||
Original tagger author: Kristina Toutanova
|
||||
Code contributions: Christopher Manning, Dan Klein, William Morgan,
|
||||
Huihsin Tseng, Anna Rafferty, John Bauer
|
||||
Major rewrite for version 2.0 by Michel Galley.
|
||||
Current release prepared by: John Bauer
|
||||
|
||||
This package contains a Maximum Entropy part of speech tagger.
|
||||
|
||||
A Part-Of-Speech Tagger (POS Tagger) is a piece of software that reads
|
||||
text in some language and assigns parts of speech to each word (and
|
||||
other tokens), such as noun, verb, adjective, etc. Generally
|
||||
computational applications use more fine-grained POS tags like
|
||||
'noun-plural'. This software is a Java implementation of the log-linear
|
||||
part-of-speech (POS) taggers described in:
|
||||
|
||||
Kristina Toutanova and Christopher D. Manning. 2000. Enriching the
|
||||
Knowledge Sources Used in a Maximum Entropy Part-of-Speech
|
||||
Tagger. Proceedings of the Joint SIGDAT Conference on Empirical Methods
|
||||
in Natural Language Processing and Very Large Corpora (EMNLP/VLC-2000),
|
||||
Hong Kong.
|
||||
|
||||
Kristina Toutanova, Dan Klein, Christopher Manning, and Yoram
|
||||
Singer. 2003. Feature-Rich Part-of-Speech Tagging with a Cyclic
|
||||
Dependency Network. In Proceedings of HLT-NAACL 2003 pages 252-259.
|
||||
|
||||
The system requires Java 1.6+ to be installed. About 60 MB of memory is
|
||||
required to run a trained tagger, depending on the OS, tagging model
|
||||
chosen, etc. (i.e., you may need to give to java an option like java
|
||||
-mx120m). Plenty of memory is needed to train a tagger. It depends on
|
||||
the complexity of the model but at least 1GB is recommended (java
|
||||
-mx1g). Two trained tagger models for English are included with the
|
||||
tagger, along with some caseless versions, and we provide models for
|
||||
some other languages. The tagger can be retrained on other languages
|
||||
based on POS-annotated training text.
|
||||
|
||||
If you really want to use this software under Java 1.4, look into RetroWeaver:
|
||||
|
||||
http://retroweaver.sourceforge.net/
|
||||
|
||||
|
||||
QUICKSTART
|
||||
-----------------------------------------------
|
||||
|
||||
The Stanford POS Tagger is designed to be used from the command line or
|
||||
programmatically via its API.
|
||||
|
||||
There is a GUI interface, but it is for
|
||||
demonstration purposes only; most features of the tagger can only be
|
||||
accessed via the command line. To run the demonstration GUI you should
|
||||
be able to use any of the following 3 methods:
|
||||
|
||||
1)
|
||||
java -mx200m -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTaggerGUI models/wsj-0-18-left3words.tagger
|
||||
|
||||
2)
|
||||
Providing your system gives java enough memory by default, you can also
|
||||
run it by either double-clicking the stanford-postagger.jar file, or
|
||||
giving the command:
|
||||
java -jar stanford-postagger.jar
|
||||
|
||||
3) Running the appropriate script for your operating system:
|
||||
stanford-postagger-gui.bat
|
||||
./stanford-postagger-gui.sh
|
||||
|
||||
To run the tagger from the command line, you can start with the provided
|
||||
script appropriate for your operating system:
|
||||
./stanford-postagger.sh models/wsj-0-18-left3words.tagger sample-input.txt
|
||||
stanford-postagger models\wsj-0-18-left3words.tagger sample-input.txt
|
||||
The output should match what is found in sample-output.txt
|
||||
|
||||
The tagger has three modes: tagging, training, and testing. Tagging
|
||||
allows you to use a pretrained model (two English models are included)
|
||||
to assign part of speech tags to unlabeled text. Training allows you to
|
||||
save a new model based on a set of tagged data that you provide.
|
||||
Testing allows you to see how well a tagger performs by tagging labeled
|
||||
data and evaluating the results against the correct tags.
|
||||
|
||||
Many options are available for training, tagging, and testing. These
|
||||
options can be set using a properties file. To start, you can generate a
|
||||
default properties file by:
|
||||
|
||||
java -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger -genprops > myPropsFile.prop
|
||||
|
||||
This will create the file myPropsFile.prop with descriptions of each
|
||||
option for the tagger and the default values for these options
|
||||
specified. Any properties you can specify in a properties file can be
|
||||
specified on the command line or vice versa. For further information,
|
||||
please consult the Javadocs (start with the entry for MaxentTagger,
|
||||
which includes a table of all options which may be set to configure the
|
||||
tagger and descriptions of those options).
|
||||
|
||||
|
||||
To tag a file using the pre-trained bidirectional model
|
||||
=======================================================
|
||||
|
||||
java -mx300m -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger -model models/wsj-0-18-bidirectional-distsim.tagger -textFile sample-input.txt > sample-tagged.txt
|
||||
|
||||
Tagged output will be printed to standard out, which you can redirect
|
||||
as above. Note that the bidirectional model is slightly more accurate
|
||||
but significantly slower than the left3words model.
|
||||
|
||||
To train a simple model
|
||||
=======================
|
||||
|
||||
java -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger -prop propertiesFile -model modelFile -trainFile trainingFile
|
||||
|
||||
To test a model
|
||||
===============
|
||||
|
||||
java -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger -prop propertiesFile -model modelFile -testFile testFile
|
||||
|
||||
|
||||
|
||||
CONTENTS
|
||||
-----------------------------------------------
|
||||
README.txt
|
||||
|
||||
This file.
|
||||
|
||||
LICENSE.txt
|
||||
|
||||
Stanford POS Tagger is licensed under the GNU General Public License (v2+).
|
||||
|
||||
stanford-postagger.jar
|
||||
stanford-postagger-YYYY-MM-DD.jar
|
||||
|
||||
This is a JAR file containing all the Stanford classes necessary to
|
||||
run the Stanford POS Tagger. The two jar files are identical. You can use
|
||||
either the one with a version (date) indication or without, as you prefer.
|
||||
|
||||
src
|
||||
|
||||
A directory containing the Java 1.5 source code for the Stanford POS
|
||||
Tagger distribution.
|
||||
|
||||
build.xml, Makefile
|
||||
|
||||
Files for building the distribution (with ant and make, respectively)
|
||||
|
||||
models
|
||||
|
||||
A directory containing trained POS taggers; the taggers end in ".tagger"
|
||||
and the props file used to make the taggers end in ".props". The
|
||||
".props" files cannot be directly used on your own machine as they use
|
||||
paths on the Stanford NLP machines, but they may serve as examples for
|
||||
your own properties files. Included in the full version are other
|
||||
English taggers, a German tagger, an Arabic tagger, and a Chinese
|
||||
tagger. If you chose to download the smaller version of the tagger,
|
||||
you have only two English taggers (left3words is faster but slightly
|
||||
less accurate than bidirectional-distsim) - feel free to download any
|
||||
other taggers you need from the POS tagger website. More information
|
||||
about the models can be found in the README-Models.txt file in this
|
||||
directory.
|
||||
|
||||
sample-input.txt
|
||||
|
||||
A sample text file that you can tag to demonstrate the tagger.
|
||||
|
||||
sample-output.txt
|
||||
|
||||
Tagged output of the tagger (using the left3words model)
|
||||
|
||||
stanford-postagger-gui.sh
|
||||
stanford-postagger-gui.bat
|
||||
|
||||
Scripts for invoking the GUI demonstration version of the tagger.
|
||||
|
||||
stanford-postagger.sh
|
||||
stanford-postagger.bat
|
||||
|
||||
Scripts for running the command-line version of the tagger.
|
||||
|
||||
javadoc
|
||||
|
||||
Javadocs for the distribution. In particular, look at the javadocs
|
||||
for the class edu.stanford.nlp.tagger.maxent.MaxentTagger.
|
||||
|
||||
TaggerDemo.java
|
||||
|
||||
A sample file for how to call the tagger in your own program. You
|
||||
should be able to compile and run it with:
|
||||
|
||||
javac -cp stanford-postagger.jar TaggerDemo.java
|
||||
java -cp ".:stanford-postagger.jar" TaggerDemo models/wsj-0-18-left3words.tagger sample-input.txt
|
||||
|
||||
(If you are on Windows, you need to replace the ":" with a ";" in the
|
||||
-cp argument, and should use a "\" in place of the "/" in the filename.)
|
||||
|
||||
THANKS
|
||||
-----------------------------------------------
|
||||
|
||||
Thanks to the members of the Stanford Natural Language Processing Lab
|
||||
for great collaborative work on Java libraries for natural language
|
||||
processing.
|
||||
|
||||
http://nlp.stanford.edu/javanlp/
|
||||
|
||||
CHANGES
|
||||
-----------------------------------------------
|
||||
|
||||
2014-10-26 3.5.0 Upgrade to Java 1.8; add annotators for
|
||||
dependency parsing and relation extraction
|
||||
|
||||
2014-10-26 3.5.0 Upgrade to Java 1.8; add annotators for
|
||||
dependency parsing and relation extraction
|
||||
|
||||
2014-08-27 3.4.1 Add Spanish models
|
||||
|
||||
2014-06-16 3.4 Using CC tagset for French
|
||||
|
||||
2014-01-04 3.3.1 Bugfix release
|
||||
|
||||
2013-11-12 3.3.0 Add imperatives to English training data
|
||||
|
||||
2013-06-19 3.2.0 Decrease size and improve speed of tagger
|
||||
models for all languages
|
||||
|
||||
2013-04-04 3.1.5 Speed improvements, ctb7 model, -nthreads
|
||||
option
|
||||
|
||||
2012-11-11 3.1.4 Updated Chinese model
|
||||
|
||||
2012-07-09 3.1.3 Minor bug fixes
|
||||
|
||||
2012-05-22 3.1.2 Updated for compatibility with other releases
|
||||
|
||||
2012-03-09 3.1.1 Caseless models added
|
||||
|
||||
2012-01-06 3.1.0 French tagger added, tagging speed improved
|
||||
|
||||
2011-09-14 3.0.4 Updated for compatibility with other releases
|
||||
|
||||
2011-06-15 3.0.3 Updated for compatibility with other releases
|
||||
|
||||
2011-05-15 3.0.2 Can read training files in TSV format
|
||||
|
||||
2011-04-17 3.0.1 Improved German and Arabic models
|
||||
Compatible with other Stanford releases
|
||||
|
||||
2010-05-21 3.0.0 Re-entrant
|
||||
|
||||
LICENSE
|
||||
-----------------------------------------------
|
||||
|
||||
Stanford POS Tagger
|
||||
Copyright (c) 2002-2010 The Board of Trustees of
|
||||
The Leland Stanford Junior University. All Rights Reserved.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
For more information, bug reports, fixes, contact:
|
||||
Christopher Manning
|
||||
Dept of Computer Science, Gates 1A
|
||||
Stanford CA 94305-9010
|
||||
USA
|
||||
Support/Questions: java-nlp-user@lists.stanford.edu
|
||||
Licensing: java-nlp-support@lists.stanford.edu
|
||||
http://www-nlp.stanford.edu/software/tagger.shtml
|
||||
|
||||
|
||||
CONTACT
|
||||
-----------------------------------------------
|
||||
|
||||
For questions about the Stanford POS tagger, please feel free to contact
|
||||
the Stanford JavaNLP user community at the mailing list
|
||||
java-nlp-user@lists.stanford.edu. You need to be a member of this
|
||||
mailing list to be able to post to it. Join the list either by emailing
|
||||
java-nlp-user-join@lists.stanford.edu (leave the subject and message
|
||||
body empty) or by using the web interface at:
|
||||
|
||||
https://mailman.stanford.edu/mailman/listinfo/java-nlp-user
|
||||
|
||||
This is the best list to post to in order to ask questions, make
|
||||
announcements, or for discussion among Stanford JavaNLP tool users. We
|
||||
provide assistance on a best-effort basis. You can also look at the list
|
||||
archives via https://mailman.stanford.edu/pipermail/java-nlp-user/. For
|
||||
licensing questions, please see the tagger webpage or contact Stanford
|
||||
JavaNLP at java-nlp-support@lists.stanford.edu.
|
||||
|
||||
28
stanford-postagger-full-2014-10-26/TaggerDemo.java
Normal file
28
stanford-postagger-full-2014-10-26/TaggerDemo.java
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.util.List;
|
||||
|
||||
import edu.stanford.nlp.ling.Sentence;
|
||||
import edu.stanford.nlp.ling.TaggedWord;
|
||||
import edu.stanford.nlp.ling.HasWord;
|
||||
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
|
||||
|
||||
class TaggerDemo {
|
||||
|
||||
private TaggerDemo() {}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length != 2) {
|
||||
System.err.println("usage: java TaggerDemo modelFile fileToTag");
|
||||
return;
|
||||
}
|
||||
MaxentTagger tagger = new MaxentTagger(args[0]);
|
||||
List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[1])));
|
||||
for (List<HasWord> sentence : sentences) {
|
||||
List<TaggedWord> tSentence = tagger.tagSentence(sentence);
|
||||
System.out.println(Sentence.listToString(tSentence, false));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
58
stanford-postagger-full-2014-10-26/TaggerDemo2.java
Normal file
58
stanford-postagger-full-2014-10-26/TaggerDemo2.java
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.List;
|
||||
|
||||
import edu.stanford.nlp.ling.Sentence;
|
||||
import edu.stanford.nlp.ling.TaggedWord;
|
||||
import edu.stanford.nlp.ling.HasWord;
|
||||
import edu.stanford.nlp.ling.CoreLabel;
|
||||
import edu.stanford.nlp.process.CoreLabelTokenFactory;
|
||||
import edu.stanford.nlp.process.DocumentPreprocessor;
|
||||
import edu.stanford.nlp.process.PTBTokenizer;
|
||||
import edu.stanford.nlp.process.TokenizerFactory;
|
||||
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
|
||||
|
||||
/** This demo shows user-provided sentences (i.e., {@code List<HasWord>})
|
||||
* being tagged by the tagger. The sentences are generated by direct use
|
||||
* of the DocumentPreprocessor class.
|
||||
*
|
||||
* @author Christopher Manning
|
||||
*/
|
||||
class TaggerDemo2 {
|
||||
|
||||
private TaggerDemo2() {}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length != 2) {
|
||||
System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
|
||||
return;
|
||||
}
|
||||
MaxentTagger tagger = new MaxentTagger(args[0]);
|
||||
TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
|
||||
"untokenizable=noneKeep");
|
||||
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
|
||||
PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
|
||||
DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
|
||||
documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
|
||||
for (List<HasWord> sentence : documentPreprocessor) {
|
||||
List<TaggedWord> tSentence = tagger.tagSentence(sentence);
|
||||
pw.println(Sentence.listToString(tSentence, false));
|
||||
}
|
||||
|
||||
// print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
|
||||
List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
|
||||
List<TaggedWord> taggedSent = tagger.tagSentence(sent);
|
||||
for (TaggedWord tw : taggedSent) {
|
||||
if (tw.tag().startsWith("JJ")) {
|
||||
pw.println(tw.word());
|
||||
}
|
||||
}
|
||||
|
||||
pw.close();
|
||||
}
|
||||
|
||||
}
|
||||
206
stanford-postagger-full-2014-10-26/build.xml
Normal file
206
stanford-postagger-full-2014-10-26/build.xml
Normal file
|
|
@ -0,0 +1,206 @@
|
|||
<!-- build.xml file for ant for JavaNLP -->
|
||||
|
||||
<!-- A "project" describes a set of targets that may be requested
|
||||
when Ant is executed. The "default" attribute defines the
|
||||
target which is executed if no specific target is requested,
|
||||
and the "basedir" attribute defines the current working directory
|
||||
from which Ant executes the requested task. This is normally
|
||||
set to the current working directory.
|
||||
-->
|
||||
|
||||
<project name="JavaNLP" default="compile" basedir=".">
|
||||
|
||||
<property name="build.home" value="${basedir}/classes"/>
|
||||
<property name="build.tests" value="${basedir}/classes"/>
|
||||
<property name="docs.home" value="${basedir}/docs"/>
|
||||
<property name="src.home" value="${basedir}/src"/>
|
||||
<property name="javadoc.home" value="${basedir}/javadoc"/>
|
||||
|
||||
|
||||
<!-- ==================== Compilation Control Options ==================== -->
|
||||
|
||||
<!--
|
||||
|
||||
These properties control option settings on the Javac compiler when it
|
||||
is invoked using the <javac> task.
|
||||
|
||||
compile.debug Should compilation include the debug option?
|
||||
|
||||
compile.deprecation Should compilation include the deprecation option?
|
||||
|
||||
compile.optimize Should compilation include the optimize option?
|
||||
|
||||
compile.source Source version compatibility
|
||||
|
||||
compile.target Target class version compatibility
|
||||
|
||||
-->
|
||||
|
||||
<property name="compile.debug" value="true"/>
|
||||
<property name="compile.deprecation" value="false"/>
|
||||
<property name="compile.optimize" value="true"/>
|
||||
<property name="compile.source" value="1.8" />
|
||||
<property name="compile.target" value="1.8" />
|
||||
|
||||
|
||||
<!-- ==================== All Target ====================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
The "all" target is a shortcut for running the "clean" target followed
|
||||
by the "compile" target, to force a complete recompile.
|
||||
|
||||
-->
|
||||
|
||||
<target name="all" depends="clean,compile"
|
||||
description="Clean build and dist directories, then compile"/>
|
||||
|
||||
|
||||
|
||||
<!-- ==================== Clean Target ==================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
The "clean" target deletes any previous "build" and "dist" directory,
|
||||
so that you can be assured the application can be built from scratch.
|
||||
|
||||
-->
|
||||
|
||||
<target name="clean" description="Delete old classes">
|
||||
<delete dir="${build.home}/edu"/>
|
||||
</target>
|
||||
|
||||
|
||||
<!-- ==================== Classpath Targets ==================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
Sets the classpath for this project properly. We now always use the
|
||||
lib dir within javanlp.
|
||||
|
||||
-->
|
||||
|
||||
<target name="classpath" description="Sets the classpath">
|
||||
<path id="compile.classpath">
|
||||
<!-- <fileset dir="${basedir}/lib">
|
||||
<include name="*.jar"/>
|
||||
<exclude name="javanlp*"/>
|
||||
</fileset> -->
|
||||
</path>
|
||||
</target>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!-- ==================== Compile Target ================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
The "compile" target transforms source files (from your "src" directory)
|
||||
into object files in the appropriate location in the build directory.
|
||||
This example assumes that you will be including your classes in an
|
||||
unpacked directory hierarchy under "/WEB-INF/classes".
|
||||
|
||||
-->
|
||||
|
||||
<target name="compile" depends="prepare,classpath"
|
||||
description="Compile Java sources">
|
||||
|
||||
<!-- Compile Java classes as necessary -->
|
||||
<mkdir dir="${build.home}"/>
|
||||
<javac srcdir="${src.home}"
|
||||
destdir="${build.home}"
|
||||
debug="${compile.debug}"
|
||||
encoding="utf-8"
|
||||
deprecation="${compile.deprecation}"
|
||||
optimize="${compile.optimize}"
|
||||
source="${compile.source}"
|
||||
target="${compile.target}"
|
||||
includeantruntime="false">
|
||||
<classpath refid="compile.classpath"/>
|
||||
<compilerarg value="-Xmaxerrs"/>
|
||||
<compilerarg value="20"/>
|
||||
<!-- <compilerarg value="-Xlint"/> -->
|
||||
</javac>
|
||||
|
||||
<!-- Copy application resources -->
|
||||
<!--
|
||||
<copy todir="${build.home}/WEB-INF/classes">
|
||||
<fileset dir="${src.home}" excludes="**/*.java"/>
|
||||
</copy>
|
||||
-->
|
||||
|
||||
</target>
|
||||
|
||||
<!-- ==================== Jar Target ================================== -->
|
||||
|
||||
<!--
|
||||
The "jar" target recreates the jar file, which you may want to do if
|
||||
you take apart the source jar and change something.
|
||||
-->
|
||||
|
||||
<property name="jar.output" value="stanford-postagger.jar" />
|
||||
<property name="jar.mainclass" value="edu.stanford.nlp.tagger.maxent.MaxentTaggerGUI" />
|
||||
|
||||
<target name="jar" depends="compile" description="Build a jar file">
|
||||
<jar destfile="${jar.output}">
|
||||
<fileset dir="${build.home}"/>
|
||||
<manifest>
|
||||
<attribute name="Main-Class" value="${jar.mainclass}"/>
|
||||
</manifest>
|
||||
</jar>
|
||||
</target>
|
||||
|
||||
<!-- ==================== Javadoc Target ================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
The "javadoc" target creates Javadoc API documentation for the Java
|
||||
classes included in your application. Normally, this is only required
|
||||
when preparing a distribution release, but is available as a separate
|
||||
target in case the developer wants to create Javadocs independently.
|
||||
|
||||
-->
|
||||
|
||||
<target name="javadoc" depends="compile"
|
||||
description="Create Javadoc API documentation">
|
||||
|
||||
<mkdir dir="${javadoc.home}"/>
|
||||
<javadoc sourcepath="${src.home}"
|
||||
destdir="${javadoc.home}"
|
||||
maxmemory="768m"
|
||||
author="true"
|
||||
source="1.6"
|
||||
Overview="${src.home}/edu/stanford/nlp/overview.html"
|
||||
Doctitle="Stanford JavaNLP API Documentation"
|
||||
Windowtitle="Stanford JavaNLP API"
|
||||
packagenames="*">
|
||||
<bottom><![CDATA[<FONT SIZE=2><A HREF=\"http://nlp.stanford.edu\">Stanford NLP Group</A></FONT>]]></bottom>
|
||||
<link href="http://java.sun.com/j2se/1.6.0/docs/api/"/>
|
||||
</javadoc>
|
||||
|
||||
</target>
|
||||
|
||||
|
||||
<!-- ==================== Prepare Target ================================== -->
|
||||
|
||||
<!--
|
||||
|
||||
The "prepare" target is used to create the "build" destination directory,
|
||||
and copy the static contents of your web application to it. If you need
|
||||
to copy static files from external dependencies, you can customize the
|
||||
contents of this task.
|
||||
|
||||
Normally, this task is executed indirectly when needed.
|
||||
|
||||
-->
|
||||
|
||||
<target name="prepare">
|
||||
|
||||
<!-- Create build directories as needed -->
|
||||
<mkdir dir="${build.home}"/>
|
||||
|
||||
</target>
|
||||
|
||||
</project>
|
||||
53449
stanford-postagger-full-2014-10-26/data/enclitic-inflections.data
Normal file
53449
stanford-postagger-full-2014-10-26/data/enclitic-inflections.data
Normal file
File diff suppressed because it is too large
Load diff
121
stanford-postagger-full-2014-10-26/models/README-Models.txt
Normal file
121
stanford-postagger-full-2014-10-26/models/README-Models.txt
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
Stanford POS Tagger, v3.5.0 - 2014-10-26
|
||||
Copyright (c) 2002-2012 The Board of Trustees of
|
||||
The Leland Stanford Junior University. All Rights Reserved.
|
||||
|
||||
This document contains (some) information about the models included in
|
||||
this release and that may be downloaded from the POS tagger website at
|
||||
http://nlp.stanford.edu/software/tagger.shtml . If you have downloaded
|
||||
the full tagger, all of the models mentioned in this document are in the
|
||||
downloaded package in the same directory as this readme. Otherwise,
|
||||
included in the download are two
|
||||
English taggers, and the other taggers may be downloaded from the
|
||||
website. All taggers are accompanied by the props files used to create
|
||||
them; please examine these files for more detailed information about the
|
||||
creation of the taggers.
|
||||
|
||||
For English, the bidirectional taggers are slightly more accurate, but
|
||||
tag much more slowly; choose the appropriate tagger based on your
|
||||
speed/performance needs.
|
||||
|
||||
English taggers
|
||||
---------------------------
|
||||
wsj-0-18-bidirectional-distsim.tagger
|
||||
Trained on WSJ sections 0-18 using a bidirectional architecture and
|
||||
including word shape and distributional similarity features.
|
||||
Penn Treebank tagset.
|
||||
Performance:
|
||||
97.28% correct on WSJ 19-21
|
||||
(90.46% correct on unknown words)
|
||||
|
||||
wsj-0-18-left3words.tagger
|
||||
Trained on WSJ sections 0-18 using the left3words architecture and
|
||||
includes word shape features. Penn tagset.
|
||||
Performance:
|
||||
96.97% correct on WSJ 19-21
|
||||
(88.85% correct on unknown words)
|
||||
|
||||
wsj-0-18-left3words-distsim.tagger
|
||||
Trained on WSJ sections 0-18 using the left3words architecture and
|
||||
includes word shape and distributional similarity features. Penn tagset.
|
||||
Performance:
|
||||
97.01% correct on WSJ 19-21
|
||||
(89.81% correct on unknown words)
|
||||
|
||||
english-left3words-distsim.tagger
|
||||
Trained on WSJ sections 0-18 and extra parser training data using the
|
||||
left3words architecture and includes word shape and distributional
|
||||
similarity features. Penn tagset.
|
||||
|
||||
english-bidirectional-distsim.tagger
|
||||
Trained on WSJ sections 0-18 using a bidirectional architecture and
|
||||
including word shape and distributional similarity features.
|
||||
Penn Treebank tagset.
|
||||
|
||||
wsj-0-18-caseless-left3words-distsim.tagger
|
||||
Trained on WSJ sections 0-18 using the left3words architecture and includes word
|
||||
shape and distributional similarity features. Penn tagset. Ignores case.
|
||||
|
||||
english-caseless-left3words-distsim.tagger
|
||||
Trained on WSJ sections 0-18 and extra parser training data using the
|
||||
left3words architecture and includes word shape and distributional
|
||||
similarity features. Penn tagset. Ignores case.
|
||||
|
||||
|
||||
Chinese tagger
|
||||
---------------------------
|
||||
chinese-nodistsim.tagger
|
||||
Trained on a combination of CTB7 texts from Chinese and Hong Kong
|
||||
sources.
|
||||
LDC Chinese Treebank POS tag set.
|
||||
Performance:
|
||||
93.46% on a combination of Chinese and Hong Kong texts
|
||||
(79.40% on unknown words)
|
||||
|
||||
chinese-distsim.tagger
|
||||
Trained on a combination of CTB7 texts from Chinese and Hong Kong
|
||||
sources with distributional similarity clusters.
|
||||
LDC Chinese Treebank POS tag set.
|
||||
Performance:
|
||||
93.99% on a combination of Chinese and Hong Kong texts
|
||||
(84.60% on unknown words)
|
||||
|
||||
Arabic tagger
|
||||
---------------------------
|
||||
arabic.tagger
|
||||
Trained on the *entire* ATB p1-3.
|
||||
When trained on the train part of the ATB p1-3 split done for the 2005
|
||||
JHU Summer Workshop (Diab split), using (augmented) Bies tags, it gets
|
||||
the following performance:
|
||||
96.26% on test portion according to Diab split
|
||||
(80.14% on unknown words)
|
||||
|
||||
French tagger
|
||||
---------------------------
|
||||
french.tagger
|
||||
Trained on the French treebank.
|
||||
|
||||
German tagger
|
||||
---------------------------
|
||||
german-hgc.tagger
|
||||
Trained on the first 80% of the Negra corpus, which uses the STTS tagset.
|
||||
The Stuttgart-Tübingen Tagset (STTS) is a set of 54 tags for annotating
|
||||
German text corpora with part-of-speech labels, which was jointly
|
||||
developed by the Institut für maschinelle Sprachverarbeitung of the
|
||||
University of Stuttgart and the Seminar für Sprachwissenschaft of the
|
||||
University of Tübingen. See:
|
||||
http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
|
||||
This model uses features from the distributional similarity clusters
|
||||
built over the HGC.
|
||||
Performance:
|
||||
96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
|
||||
(90.33% on unknown words)
|
||||
|
||||
german-dewac.tagger
|
||||
This model uses features from the distributional similarity clusters
|
||||
built from the deWac web corpus.
|
||||
|
||||
german-fast.tagger
|
||||
Lacks distributional similarity features, but is several times faster
|
||||
than the other alternatives.
|
||||
Performance:
|
||||
96.61% overall / 86.72% unknown.
|
||||
BIN
stanford-postagger-full-2014-10-26/models/german-fast.tagger
Normal file
BIN
stanford-postagger-full-2014-10-26/models/german-fast.tagger
Normal file
Binary file not shown.
|
|
@ -0,0 +1,35 @@
|
|||
## tagger training invoked at Tue Jul 08 16:25:43 PDT 2014 with arguments:
|
||||
model = german-fast.tagger
|
||||
arch = words(-2,2),order(1),prefix(1),suffix(10)
|
||||
wordFunction =
|
||||
trainFile = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_1.mrg
|
||||
closedClassTags =
|
||||
closedClassTagThreshold = 40
|
||||
curWordMinFeatureThresh = 2
|
||||
debug = false
|
||||
debugPrefix =
|
||||
tagSeparator = _
|
||||
encoding = iso-8859-1
|
||||
iterations = 100
|
||||
lang = german
|
||||
learnClosedClassTags = false
|
||||
minFeatureThresh = 2
|
||||
openClassTags =
|
||||
rareWordMinFeatureThresh = 10
|
||||
rareWordThresh = 5
|
||||
search = owlqn
|
||||
sgml = false
|
||||
sigmaSquared = 0.0
|
||||
regL1 = 0.75
|
||||
tagInside =
|
||||
tokenize = true
|
||||
tokenizerFactory =
|
||||
tokenizerOptions = asciiQuotes
|
||||
verbose = false
|
||||
verboseResults = true
|
||||
veryCommonWordThresh = 250
|
||||
xmlInput = null
|
||||
outputFile =
|
||||
outputFormat = slashTags
|
||||
outputFormatOptions =
|
||||
nthreads = 1
|
||||
6
stanford-postagger-full-2014-10-26/sample-input.txt
Normal file
6
stanford-postagger-full-2014-10-26/sample-input.txt
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
A passenger plane has crashed shortly after take-off from Kyrgyzstan's
|
||||
capital, Bishkek, killing a large number of those on board. The head of
|
||||
Kyrgyzstan's civil aviation authority said that out of about 90
|
||||
passengers and crew, only about 20 people have survived. The Itek Air
|
||||
Boeing 737 took off bound for Mashhad, in north-eastern Iran, but turned
|
||||
round some 10 minutes later.
|
||||
3
stanford-postagger-full-2014-10-26/sample-output.txt
Normal file
3
stanford-postagger-full-2014-10-26/sample-output.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
A_DT passenger_NN plane_NN has_VBZ crashed_VBN shortly_RB after_IN take-off_NN from_IN Kyrgyzstan_NNP 's_POS capital_NN ,_, Bishkek_NNP ,_, killing_VBG a_DT large_JJ number_NN of_IN those_DT on_IN board_NN ._.
|
||||
The_DT head_NN of_IN Kyrgyzstan_NNP 's_POS civil_JJ aviation_NN authority_NN said_VBD that_IN out_IN of_IN about_IN 90_CD passengers_NNS and_CC crew_NN ,_, only_RB about_IN 20_CD people_NNS have_VBP survived_VBN ._.
|
||||
The_DT Itek_NNP Air_NNP Boeing_NNP 737_CD took_VBD off_RP bound_VBN for_IN Mashhad_NNP ,_, in_IN north-eastern_JJ Iran_NNP ,_, but_CC turned_VBD round_NN some_DT 10_CD minutes_NNS later_RB ._.
|
||||
Binary file not shown.
Binary file not shown.
BIN
stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar
Normal file
BIN
stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar
Normal file
Binary file not shown.
|
|
@ -0,0 +1,3 @@
|
|||
:: runs the POS tagger (toy) GUI
|
||||
:: usage stanford-postagger-gui
|
||||
java -mx200m -cp "stanford-postagger.jar;" edu.stanford.nlp.tagger.maxent.MaxentTaggerGUI
|
||||
2
stanford-postagger-full-2014-10-26/stanford-postagger-gui.sh
Executable file
2
stanford-postagger-full-2014-10-26/stanford-postagger-gui.sh
Executable file
|
|
@ -0,0 +1,2 @@
|
|||
#!/bin/sh
|
||||
java -mx200m -cp 'stanford-postagger.jar:' edu.stanford.nlp.tagger.maxent.MaxentTaggerGUI
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
:: usage: stanford-postagger model textFile
|
||||
:: e.g., stanford-postagger models\english-left3words-distsim.tagger sample-input.txt
|
||||
|
||||
java -mx300m -cp "stanford-postagger.jar;" edu.stanford.nlp.tagger.maxent.MaxentTagger -model %1 -textFile %2
|
||||
BIN
stanford-postagger-full-2014-10-26/stanford-postagger.jar
Normal file
BIN
stanford-postagger-full-2014-10-26/stanford-postagger.jar
Normal file
Binary file not shown.
6
stanford-postagger-full-2014-10-26/stanford-postagger.sh
Executable file
6
stanford-postagger-full-2014-10-26/stanford-postagger.sh
Executable file
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/sh
|
||||
#
|
||||
# usage: ./stanford-postagger.sh model textFile
|
||||
# e.g., ./stanford-postagger.sh models/english-left3words-distsim.tagger sample-input.txt
|
||||
|
||||
java -mx300m -cp 'stanford-postagger.jar:' edu.stanford.nlp.tagger.maxent.MaxentTagger -model $1 -textFile $2
|
||||
17
tagger.py
Normal file
17
tagger.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
from nltk.tag.stanford import POSTagger
|
||||
from models import Article
|
||||
from analyze import *
|
||||
|
||||
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
|
||||
'./stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
|
||||
'UTF-8')
|
||||
|
||||
for article in Article.select().limit(100):
|
||||
potential = get_potential_places(article.place, article.description)
|
||||
places = improve_potential_places(potential)
|
||||
|
||||
print(article.place)
|
||||
print(article.description)
|
||||
print()
|
||||
print("Potential: " + str(potential))
|
||||
print("Improved: " + str(places))
|
||||
Loading…
Add table
Add a link
Reference in a new issue