diff options
166 files changed, 9130 insertions, 3827 deletions
diff --git a/ext/ply/ANNOUNCE b/ext/ply/ANNOUNCE index f40902021..0a155cec3 100644 --- a/ext/ply/ANNOUNCE +++ b/ext/ply/ANNOUNCE @@ -1,12 +1,13 @@ -February 19, 2007 +March 24, 2009 - Announcing : PLY-2.3 (Python Lex-Yacc) + Announcing : PLY-3.2 (Python Lex-Yacc) http://www.dabeaz.com/ply I'm pleased to announce a significant new update to PLY---a 100% Python -implementation of the common parsing tools lex and yacc. PLY-2.3 is -a minor bug fix release, but also features improved performance. +implementation of the common parsing tools lex and yacc. PLY-3.2 adds +compatibility for Python 2.6 and 3.0, provides some new customization +options, and cleans up a lot of internal implementation details. If you are new to PLY, here are a few highlights: @@ -29,19 +30,11 @@ If you are new to PLY, here are a few highlights: problems. Currently, PLY can build its parsing tables using either SLR or LALR(1) algorithms. -- PLY can be used to build parsers for large programming languages. - Although it is not ultra-fast due to its Python implementation, - PLY can be used to parse grammars consisting of several hundred - rules (as might be found for a language like C). The lexer and LR - parser are also reasonably efficient when parsing normal - sized programs. - More information about PLY can be obtained on the PLY webpage at: http://www.dabeaz.com/ply -PLY is freely available and is licensed under the terms of the Lesser -GNU Public License (LGPL). +PLY is freely available. Cheers, diff --git a/ext/ply/CHANGES b/ext/ply/CHANGES index d88f3e5d6..9d8b25d5a 100644 --- a/ext/ply/CHANGES +++ b/ext/ply/CHANGES @@ -1,3 +1,335 @@ + +Version 3.2 +----------------------------- +03/24/09: beazley + Added an extra check to not print duplicated warning messages + about reduce/reduce conflicts. + +03/24/09: beazley + Switched PLY over to a BSD-license. + +03/23/09: beazley + Performance optimization. Discovered a few places to make + speedups in LR table generation. 
+ +03/23/09: beazley + New warning message. PLY now warns about rules never + reduced due to reduce/reduce conflicts. Suggested by + Bruce Frederiksen. + +03/23/09: beazley + Some clean-up of warning messages related to reduce/reduce errors. + +03/23/09: beazley + Added a new picklefile option to yacc() to write the parsing + tables to a filename using the pickle module. Here is how + it works: + + yacc(picklefile="parsetab.p") + + This option can be used if the normal parsetab.py file is + extremely large. For example, on jython, it is impossible + to read parsing tables if the parsetab.py exceeds a certain + threshold. + + The filename supplied to the picklefile option is opened + relative to the current working directory of the Python + interpreter. If you need to refer to the file elsewhere, + you will need to supply an absolute or relative path. + + For maximum portability, the pickle file is written + using protocol 0. + +03/13/09: beazley + Fixed a bug in parser.out generation where the rule numbers + were off by one. + +03/13/09: beazley + Fixed a string formatting bug with one of the error messages. + Reported by Richard Reitmeyer + +Version 3.1 +----------------------------- +02/28/09: beazley + Fixed broken start argument to yacc(). PLY-3.0 broke this + feature by accident. + +02/28/09: beazley + Fixed debugging output. yacc() no longer reports shift/reduce + or reduce/reduce conflicts if debugging is turned off. This + restores similar behavior in PLY-2.5. Reported by Andrew Waters. + +Version 3.0 +----------------------------- +02/03/09: beazley + Fixed missing lexer attribute on certain tokens when + invoking the parser p_error() function. Reported by + Bart Whiteley. + +02/02/09: beazley + The lex() command now does all error-reporting and diagnostics + using the logging module interface. Pass in a Logger object + using the errorlog parameter to specify a different logger. 
+ +02/02/09: beazley + Refactored ply.lex to use a more object-oriented and organized + approach to collecting lexer information. + +02/01/09: beazley + Removed the nowarn option from lex(). All output is controlled + by passing in a logger object. Just pass in a logger with a high + level setting to suppress output. This argument was never + documented to begin with so hopefully no one was relying upon it. + +02/01/09: beazley + Discovered and removed a dead if-statement in the lexer. This + resulted in a 6-7% speedup in lexing when I tested it. + +01/13/09: beazley + Minor change to the procedure for signalling a syntax error in a + production rule. A normal SyntaxError exception should be raised + instead of yacc.SyntaxError. + +01/13/09: beazley + Added a new method p.set_lineno(n,lineno) that can be used to set the + line number of symbol n in grammar rules. This simplifies manual + tracking of line numbers. + +01/11/09: beazley + Vastly improved debugging support for yacc.parse(). Instead of passing + debug as an integer, you can supply a Logging object (see the logging + module). Messages will be generated at the ERROR, INFO, and DEBUG + logging levels, each level providing progressively more information. + The debugging trace also shows states, grammar rule, values passed + into grammar rules, and the result of each reduction. + +01/09/09: beazley + The yacc() command now does all error-reporting and diagnostics using + the interface of the logging module. Use the errorlog parameter to + specify a logging object for error messages. Use the debuglog parameter + to specify a logging object for the 'parser.out' output. + +01/09/09: beazley + *HUGE* refactoring of the ply.yacc() implementation. The high-level + user interface is backwards compatible, but the internals are completely + reorganized into classes. No more global variables. The internals + are also more extensible. 
For example, you can use the classes to + construct a LALR(1) parser in an entirely different manner than + what is currently the case. Documentation is forthcoming. + +01/07/09: beazley + Various cleanup and refactoring of yacc internals. + +01/06/09: beazley + Fixed a bug with precedence assignment. yacc was assigning the precedence + of each rule based on the left-most token, when in fact, it should have been + using the right-most token. Reported by Bruce Frederiksen. + +11/27/08: beazley + Numerous changes to support Python 3.0 including removal of deprecated + statements (e.g., has_key) and the addition of compatibility code + to emulate features from Python 2 that have been removed, but which + are needed. Fixed the unit testing suite to work with Python 3.0. + The code should be backwards compatible with Python 2. + +11/26/08: beazley + Loosened the rules on what kind of objects can be passed in as the + "module" parameter to lex() and yacc(). Previously, you could only use + a module or an instance. Now, PLY just uses dir() to get a list of + symbols on whatever the object is without regard for its type. + +11/26/08: beazley + Changed all except: statements to be compatible with Python2.x/3.x syntax. + +11/26/08: beazley + Changed all raise Exception, value statements to raise Exception(value) for + forward compatibility. + +11/26/08: beazley + Removed all print statements from lex and yacc, using sys.stdout and sys.stderr + directly. Preparation for Python 3.0 support. + +11/04/08: beazley + Fixed a bug with referring to symbols on the parsing stack using negative + indices. + +05/29/08: beazley + Completely revamped the testing system to use the unittest module for everything. + Added additional tests to cover new errors/warnings. + +Version 2.5 +----------------------------- +05/28/08: beazley + Fixed a bug with writing lex-tables in optimized mode and start states. + Reported by Kevin Henry. 
+ +Version 2.4 +----------------------------- +05/04/08: beazley + A version number is now embedded in the table file signature so that + yacc can more gracefully accommodate changes to the output format + in the future. + +05/04/08: beazley + Removed undocumented .pushback() method on grammar productions. I'm + not sure this ever worked and can't recall ever using it. Might have + been an abandoned idea that never really got fleshed out. This + feature was never described or tested so removing it is hopefully + harmless. + +05/04/08: beazley + Added extra error checking to yacc() to detect precedence rules defined + for undefined terminal symbols. This allows yacc() to detect a potential + problem that can be really tricky to debug if no warning message or error + message is generated about it. + +05/04/08: beazley + lex() now has an outputdir that can specify the output directory for + tables when running in optimize mode. For example: + + lexer = lex.lex(optimize=True, lextab="ltab", outputdir="foo/bar") + + The behavior of specifying a table module and output directory are + more aligned with the behavior of yacc(). + +05/04/08: beazley + [Issue 9] + Fixed filename bug when specifying the modulename in lex() and yacc(). + If you specified options such as the following: + + parser = yacc.yacc(tabmodule="foo.bar.parsetab",outputdir="foo/bar") + + yacc would create a file "foo.bar.parsetab.py" in the given directory. + Now, it simply generates a file "parsetab.py" in that directory. + Bug reported by cptbinho. + +05/04/08: beazley + Slight modification to lex() and yacc() to allow their table files + to be loaded from a previously loaded module. This might make + it easier to load the parsing tables from a complicated package + structure. For example: + + import foo.bar.spam.parsetab as parsetab + parser = yacc.yacc(tabmodule=parsetab) + + Note: lex and yacc will never regenerate the table file if used + in this form---you will get a warning message instead. 
+ This idea suggested by Brian Clapper. + + +04/28/08: beazley + Fixed a bug with p_error() functions being picked up correctly + when running in yacc(optimize=1) mode. Patch contributed by + Bart Whiteley. + +02/28/08: beazley + Fixed a bug with 'nonassoc' precedence rules. Basically the + non-precedence was being ignored and not producing the correct + run-time behavior in the parser. + +02/16/08: beazley + Slight relaxation of what the input() method to a lexer will + accept as a string. Instead of testing the input to see + if the input is a string or unicode string, it checks to see + if the input object looks like it contains string data. + This change makes it possible to pass string-like objects + in as input. For example, the object returned by mmap. + + import mmap, os + data = mmap.mmap(os.open(filename,os.O_RDONLY), + os.path.getsize(filename), + access=mmap.ACCESS_READ) + lexer.input(data) + + +11/29/07: beazley + Modification of ply.lex to allow token functions to be aliased. + This is subtle, but it makes it easier to create libraries and + to reuse token specifications. For example, suppose you defined + a function like this: + + def number(t): + r'\d+' + t.value = int(t.value) + return t + + This change would allow you to define a token rule as follows: + + t_NUMBER = number + + In this case, the token type will be set to 'NUMBER' and use + the associated number() function to process tokens. + +11/28/07: beazley + Slight modification to lex and yacc to grab symbols from both + the local and global dictionaries of the caller. This + modification allows lexers and parsers to be defined using + inner functions and closures. + +11/28/07: beazley + Performance optimization: The lexer.lexmatch and t.lexer + attributes are no longer set for lexer tokens that are not + defined by functions. The only normal use of these attributes + would be in lexer rules that need to perform some kind of + special processing. 
Thus, it doesn't make any sense to set + them on every token. + + *** POTENTIAL INCOMPATIBILITY *** This might break code + that is mucking around with internal lexer state in some + sort of magical way. + +11/27/07: beazley + Added the ability to put the parser into error-handling mode + from within a normal production. To do this, simply raise + a yacc.SyntaxError exception like this: + + def p_some_production(p): + 'some_production : prod1 prod2' + ... + raise yacc.SyntaxError # Signal an error + + A number of things happen after this occurs: + + - The last symbol shifted onto the symbol stack is discarded + and parser state backed up to what it was before + the rule reduction. + + - The current lookahead symbol is saved and replaced by + the 'error' symbol. + + - The parser enters error recovery mode where it tries + to either reduce the 'error' rule or it starts + discarding items off of the stack until the parser + resets. + + When an error is manually set, the parser does *not* call + the p_error() function (if any is defined). + *** NEW FEATURE *** Suggested on the mailing list + +11/27/07: beazley + Fixed structure bug in examples/ansic. Reported by Dion Blazakis. + +11/27/07: beazley + Fixed a bug in the lexer related to start conditions and ignored + token rules. If a rule was defined that changed state, but + returned no token, the lexer could be left in an inconsistent + state. Reported by + +11/27/07: beazley + Modified setup.py to support Python Eggs. Patch contributed by + Simon Cross. + +11/09/07: beazley + Fixed a bug in error handling in yacc. If a syntax error occurred and the + parser rolled the entire parse stack back, the parser would be left in an + inconsistent state that would cause it to trigger incorrect actions on + subsequent input. Reported by Ton Biegstraaten, Justin King, and others. + +11/09/07: beazley + Fixed a bug when passing empty input strings to yacc.parse(). 
This + would result in an error message about "No input given". Reported + by Andrew Dalke. + Version 2.3 ----------------------------- 02/20/07: beazley diff --git a/ext/ply/COPYING b/ext/ply/COPYING deleted file mode 100644 index b1e3f5a26..000000000 --- a/ext/ply/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. 
- - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. 
We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. 
- - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. 
- - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. 
- - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. 
You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. 
- - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. 
If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. 
- - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. 
However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. 
- -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. 
If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - <one line to give the library's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - <signature of Ty Coon>, 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/ext/ply/README b/ext/ply/README index 6e246c2bd..d3b785fa2 100644 --- a/ext/ply/README +++ b/ext/ply/README @@ -1,33 +1,41 @@ -PLY (Python Lex-Yacc) Version 2.3 (February 18, 2007) - -David M. Beazley (dave@dabeaz.com) - -Copyright (C) 2001-2007 David M. Beazley - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -See the file COPYING for a complete copy of the LGPL. +PLY (Python Lex-Yacc) Version 3.2 + +Copyright (C) 2001-2009, +David M. Beazley (Dabeaz LLC) +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the David Beazley or Dabeaz LLC may be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Introduction ============ PLY is a 100% Python implementation of the common parsing tools lex -and yacc. Although several other parsing tools are available for -Python, there are several reasons why you might want to consider PLY: +and yacc. Here are a few highlights: - - The tools are very closely modeled after traditional lex/yacc. + - PLY is very closely modeled after traditional lex/yacc. If you know how to use these tools in C, you will find PLY to be similar. 
@@ -43,8 +51,8 @@ Python, there are several reasons why you might want to consider PLY: - Parsing is based on LR-parsing which is fast, memory efficient, better suited to large grammars, and which has a number of nice properties when dealing with syntax errors and other parsing problems. - Currently, PLY builds its parsing tables using the SLR algorithm which - is slightly weaker than LALR(1) used in traditional yacc. + Currently, PLY builds its parsing tables using the LALR(1) + algorithm used in yacc. - PLY uses Python introspection features to build lexers and parsers. This greatly simplifies the task of parser construction since it reduces @@ -56,16 +64,8 @@ Python, there are several reasons why you might want to consider PLY: PLY can be used to parse grammars consisting of several hundred rules (as might be found for a language like C). The lexer and LR parser are also reasonably efficient when parsing typically - sized programs. - -The original version of PLY was developed for an Introduction to -Compilers course where students used it to build a compiler for a -simple Pascal-like language. Their compiler had to include lexical -analysis, parsing, type checking, type inference, and generation of -assembly code for the SPARC processor. Because of this, the current -implementation has been extensively tested and debugged. In addition, -most of the API and error checking steps have been adapted to address -common usability problems. + sized programs. People have used PLY to build parsers for + C, C++, ADA, and other real programming languages. How to Use ========== @@ -96,10 +96,10 @@ A simple example is found at the end of this document Requirements ============ -PLY requires the use of Python 2.1 or greater. However, you should +PLY requires the use of Python 2.2 or greater. However, you should use the latest Python release if possible. It should work on just about any platform. PLY has been tested with both CPython and Jython. 
-However, it does not seem to work with IronPython. +It also seems to work with IronPython. Resources ========= @@ -127,16 +127,13 @@ Elias Ioup did the first implementation of LALR(1) parsing in PLY-1.x. Andrew Waters and Markus Schoepflin were instrumental in reporting bugs and testing a revised LALR(1) implementation for PLY-2.0. -Special Note for PLY-2.x +Special Note for PLY-3.0 ======================== -PLY-2.0 is the first in a series of PLY releases that will be adding a -variety of significant new features. The first release in this series -(Ply-2.0) should be 100% compatible with all previous Ply-1.x releases -except for the fact that Ply-2.0 features a correct implementation of -LALR(1) table generation. - -If you have suggestions for improving PLY in future 2.x releases, please -contact me. - Dave +PLY-3.0 the first PLY release to support Python 3. However, backwards +compatibility with Python 2.2 is still preserved. PLY provides dual +Python 2/3 compatibility by restricting its implementation to a common +subset of basic language features. You should not convert PLY using +2to3--it is not necessary and may in fact break the implementation. Example ======= @@ -169,11 +166,7 @@ t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Integer value too large", t.value - t.value = 0 + t.value = int(t.value) return t # Ignored characters @@ -255,12 +248,12 @@ while 1: Bug Reports and Patches ======================= -Because of the extremely specialized and advanced nature of PLY, I -rarely spend much time working on it unless I receive very specific -bug-reports and/or patches to fix problems. I also try to incorporate -submitted feature requests and enhancements into each new version. To -contact me about bugs and/or new features, please send email to -dave@dabeaz.com. +My goal with PLY is to simply have a decent lex/yacc implementation +for Python. 
As a general rule, I don't spend huge amounts of time +working on it unless I receive very specific bug reports and/or +patches to fix problems. I also try to incorporate submitted feature +requests and enhancements into each new version. To contact me about +bugs and/or new features, please send email to dave@dabeaz.com. In addition there is a Google group for discussing PLY related issues at diff --git a/ext/ply/TODO b/ext/ply/TODO index 7139d53d1..f4800aacf 100644 --- a/ext/ply/TODO +++ b/ext/ply/TODO @@ -1,14 +1,16 @@ The PLY to-do list: -1. More interesting parsing examples. +1. Finish writing the C Preprocessor module. Started in the + file ply/cpp.py -2. Work on the ANSI C grammar so that it can actually parse C programs. To do this, - some extra code needs to be added to the lexer to deal with typedef names and enumeration - constants. +2. Create and document libraries of useful tokens. -3. More tests in the test directory. +3. Expand the examples/yply tool that parses bison/yacc + files. -4. Performance improvements and cleanup in yacc.py. +4. Think of various diabolical things to do with the + new yacc internals. For example, it is now possible + to specify grammrs using completely different schemes + than the reflection approach used by PLY. -5. More documentation (?). diff --git a/ext/ply/doc/internal.html b/ext/ply/doc/internal.html new file mode 100644 index 000000000..3fabfe28c --- /dev/null +++ b/ext/ply/doc/internal.html @@ -0,0 +1,874 @@ +<html> +<head> +<title>PLY Internals</title> +</head> +<body bgcolor="#ffffff"> + +<h1>PLY Internals</h1> + +<b> +David M. 
Beazley <br> +dave@dabeaz.com<br> +</b> + +<p> +<b>PLY Version: 3.0</b> +<p> + +<!-- INDEX --> +<div class="sectiontoc"> +<ul> +<li><a href="#internal_nn1">Introduction</a> +<li><a href="#internal_nn2">Grammar Class</a> +<li><a href="#internal_nn3">Productions</a> +<li><a href="#internal_nn4">LRItems</a> +<li><a href="#internal_nn5">LRTable</a> +<li><a href="#internal_nn6">LRGeneratedTable</a> +<li><a href="#internal_nn7">LRParser</a> +<li><a href="#internal_nn8">ParserReflect</a> +<li><a href="#internal_nn9">High-level operation</a> +</ul> +</div> +<!-- INDEX --> + + +<H2><a name="internal_nn1"></a>1. Introduction</H2> + + +This document describes classes and functions that make up the internal +operation of PLY. Using this programming interface, it is possible to +manually build an parser using a different interface specification +than what PLY normally uses. For example, you could build a gramar +from information parsed in a completely different input format. Some of +these objects may be useful for building more advanced parsing engines +such as GLR. + +<p> +It should be stressed that using PLY at this level is not for the +faint of heart. Generally, it's assumed that you know a bit of +the underlying compiler theory and how an LR parser is put together. + +<H2><a name="internal_nn2"></a>2. Grammar Class</H2> + + +The file <tt>ply.yacc</tt> defines a class <tt>Grammar</tt> that +is used to hold and manipulate information about a grammar +specification. It encapsulates the same basic information +about a grammar that is put into a YACC file including +the list of tokens, precedence rules, and grammar rules. +Various operations are provided to perform different validations +on the grammar. In addition, there are operations to compute +the first and follow sets that are needed by the various table +generation algorithms. + +<p> +<tt><b>Grammar(terminals)</b></tt> + +<blockquote> +Creates a new grammar object. 
<tt>terminals</tt> is a list of strings +specifying the terminals for the grammar. An instance <tt>g</tt> of +<tt>Grammar</tt> has the following methods: +</blockquote> + +<p> +<b><tt>g.set_precedence(term,assoc,level)</tt></b> +<blockquote> +Sets the precedence level and associativity for a given terminal <tt>term</tt>. +<tt>assoc</tt> is one of <tt>'right'</tt>, +<tt>'left'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is a positive integer. The higher +the value of <tt>level</tt>, the higher the precedence. Here is an example of typical +precedence settings: + +<pre> +g.set_precedence('PLUS', 'left',1) +g.set_precedence('MINUS', 'left',1) +g.set_precedence('TIMES', 'left',2) +g.set_precedence('DIVIDE','left',2) +g.set_precedence('UMINUS','left',3) +</pre> + +This method must be called prior to adding any productions to the +grammar with <tt>g.add_production()</tt>. The precedence of individual grammar +rules is determined by the precedence of the right-most terminal. + +</blockquote> +<p> +<b><tt>g.add_production(name,syms,func=None,file='',line=0)</tt></b> +<blockquote> +Adds a new grammar rule. <tt>name</tt> is the name of the rule, +<tt>syms</tt> is a list of symbols making up the right hand +side of the rule, <tt>func</tt> is the function to call when +reducing the rule. <tt>file</tt> and <tt>line</tt> specify +the filename and line number of the rule and are used for +generating error messages. + +<p> +The list of symbols in <tt>syms</tt> may include character +literals and <tt>%prec</tt> specifiers. Here are some +examples: + +<pre> +g.add_production('expr',['expr','PLUS','term'],func,file,line) +g.add_production('expr',['expr','"+"','term'],func,file,line) +g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) +</pre> + +<p> +If any kind of error is detected, a <tt>GrammarError</tt> exception +is raised with a message indicating the reason for the failure. 
+</blockquote> + +<p> +<b><tt>g.set_start(start=None)</tt></b> +<blockquote> +Sets the starting rule for the grammar. <tt>start</tt> is a string +specifying the name of the start rule. If <tt>start</tt> is omitted, +the first grammar rule added with <tt>add_production()</tt> is taken to be +the starting rule. This method must always be called after all +productions have been added. +</blockquote> + +<p> +<b><tt>g.find_unreachable()</tt></b> +<blockquote> +Diagnostic function. Returns a list of all unreachable non-terminals +defined in the grammar. This is used to identify inactive parts of +the grammar specification. +</blockquote> + +<p> +<b><tt>g.infinite_cycle()</tt></b> +<blockquote> +Diagnostic function. Returns a list of all non-terminals in the +grammar that result in an infinite cycle. This condition occurs if +there is no way for a grammar rule to expand to a string containing +only terminal symbols. +</blockquote> + +<p> +<b><tt>g.undefined_symbols()</tt></b> +<blockquote> +Diagnostic function. Returns a list of tuples <tt>(name, prod)</tt> +corresponding to undefined symbols in the grammar. <tt>name</tt> is the +name of the undefined symbol and <tt>prod</tt> is an instance of +<tt>Production</tt> which has information about the production rule +where the undefined symbol was used. +</blockquote> + +<p> +<b><tt>g.unused_terminals()</tt></b> +<blockquote> +Diagnostic function. Returns a list of terminals that were defined, +but never used in the grammar. +</blockquote> + +<p> +<b><tt>g.unused_rules()</tt></b> +<blockquote> +Diagnostic function. Returns a list of <tt>Production</tt> instances +corresponding to production rules that were defined in the grammar, +but never used anywhere. This is slightly different +than <tt>find_unreachable()</tt>. +</blockquote> + +<p> +<b><tt>g.unused_precedence()</tt></b> +<blockquote> +Diagnostic function. 
Returns a list of tuples <tt>(term, assoc)</tt> +corresponding to precedence rules that were set, but never used in the +grammar. <tt>term</tt> is the terminal name and <tt>assoc</tt> is the +precedence associativity (e.g., <tt>'left'</tt>, <tt>'right'</tt>, +or <tt>'nonassoc'</tt>).
+</blockquote> + +<p> +<b><tt>g.Nonterminals</tt></b> +<blockquote> +A dictionary mapping the names of nonterminals to a list of the +production numbers where they are used. +</blockquote> + +<p> +<b><tt>g.First</tt></b> +<blockquote> +A dictionary representing the first sets for all grammar symbols. This is +computed and returned by the <tt>compute_first()</tt> method. +</blockquote> + +<p> +<b><tt>g.Follow</tt></b> +<blockquote> +A dictionary representing the follow sets for all grammar rules. This is +computed and returned by the <tt>compute_follow()</tt> method. +</blockquote> + +<p> +<b><tt>g.Start</tt></b> +<blockquote> +Starting symbol for the grammar. Set by the <tt>set_start()</tt> method. +</blockquote> + +For the purposes of debugging, a <tt>Grammar</tt> object supports the <tt>__len__()</tt> and +<tt>__getitem__()</tt> special methods. Accessing <tt>g[n]</tt> returns the nth production +from the grammar. + + +<H2><a name="internal_nn3"></a>3. Productions</H2> + + +<tt>Grammar</tt> objects store grammar rules as instances of a <tt>Production</tt> class. This +class has no public constructor--you should only create productions by calling <tt>Grammar.add_production()</tt>. +The following attributes are available on a <tt>Production</tt> instance <tt>p</tt>. + +<p> +<b><tt>p.name</tt></b> +<blockquote> +The name of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>'A'</tt>. +</blockquote> + +<p> +<b><tt>p.prod</tt></b> +<blockquote> +A tuple of symbols making up the right-hand side of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>('B','C','D')</tt>. +</blockquote> + +<p> +<b><tt>p.number</tt></b> +<blockquote> +Production number. An integer containing the index of the production in the grammar's <tt>Productions</tt> list. +</blockquote> + +<p> +<b><tt>p.func</tt></b> +<blockquote> +The name of the reduction function associated with the production. 
+This is the function that will execute when reducing the entire +grammar rule during parsing. +</blockquote> + +<p> +<b><tt>p.callable</tt></b> +<blockquote> +The callable object associated with the name in <tt>p.func</tt>. This is <tt>None</tt> +unless the production has been bound using <tt>bind()</tt>. +</blockquote> + +<p> +<b><tt>p.file</tt></b> +<blockquote> +Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. +</blockquote> + +<p> +<b><tt>p.lineno</tt></b> +<blockquote> +Line number associated with the production. Typically this is the line number in <tt>p.file</tt> where the production was defined. Used for error messages. +</blockquote> + +<p> +<b><tt>p.prec</tt></b> +<blockquote> +Precedence and associativity associated with the production. This is a tuple <tt>(assoc,level)</tt> where +<tt>assoc</tt> is one of <tt>'left'</tt>,<tt>'right'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is +an integer. This value is determined by the precedence of the right-most terminal symbol in the production +or by use of the <tt>%prec</tt> specifier when adding the production. +</blockquote> + +<p> +<b><tt>p.usyms</tt></b> +<blockquote> +A list of all unique symbols found in the production. +</blockquote> + +<p> +<b><tt>p.lr_items</tt></b> +<blockquote> +A list of all LR items for this production. This attribute only has a meaningful value if the +<tt>Grammar.build_lritems()</tt> method has been called. The items in this list are +instances of <tt>LRItem</tt> described below. +</blockquote> + +<p> +<b><tt>p.lr_next</tt></b> +<blockquote> +The head of a linked-list representation of the LR items in <tt>p.lr_items</tt>. +This attribute only has a meaningful value if the <tt>Grammar.build_lritems()</tt> +method has been called. Each <tt>LRItem</tt> instance has a <tt>lr_next</tt> attribute +to move to the next item. The list is terminated by <tt>None</tt>. 
+</blockquote> + +<p> +<b><tt>p.bind(dict)</tt></b> +<blockquote> +Binds the production function name in <tt>p.func</tt> to a callable object in +<tt>dict</tt>. This operation is typically carried out in the last step +prior to running the parsing engine and is needed since parsing tables are typically +read from files which only include the function names, not the functions themselves. +</blockquote> + +<P> +<tt>Production</tt> objects support +the <tt>__len__()</tt>, <tt>__getitem__()</tt>, and <tt>__str__()</tt> +special methods. +<tt>len(p)</tt> returns the number of symbols in <tt>p.prod</tt> +and <tt>p[n]</tt> is the same as <tt>p.prod[n]</tt>. + +<H2><a name="internal_nn4"></a>4. LRItems</H2> + + +The construction of parsing tables in an LR-based parser generator is primarily +done over a set of "LR Items". An LR item represents a stage of parsing one +of the grammar rules. To compute the LR items, it is first necessary to +call <tt>Grammar.build_lritems()</tt>. Once this step, all of the productions +in the grammar will have their LR items attached to them. + +<p> +Here is an interactive example that shows what LR items look like if you +interactively experiment. In this example, <tt>g</tt> is a <tt>Grammar</tt> +object. + +<blockquote> +<pre> +>>> <b>g.build_lritems()</b> +>>> <b>p = g[1]</b> +>>> <b>p</b> +Production(statement -> ID = expr) +>>> +</pre> +</blockquote> + +In the above code, <tt>p</tt> represents the first grammar rule. In +this case, a rule <tt>'statement -> ID = expr'</tt>. + +<p> +Now, let's look at the LR items for <tt>p</tt>. + +<blockquote> +<pre> +>>> <b>p.lr_items</b> +[LRItem(statement -> . ID = expr), + LRItem(statement -> ID . = expr), + LRItem(statement -> ID = . expr), + LRItem(statement -> ID = expr .)] +>>> +</pre> +</blockquote> + +In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot +is advanced by one symbol. 
It is only when the dot reaches the very end that a production +is successfully parsed. + +<p> +An instance <tt>lr</tt> of <tt>LRItem</tt> has the following +attributes that hold information related to that specific stage of +parsing. + +<p> +<b><tt>lr.name</tt></b> +<blockquote> +The name of the grammar rule. For example, <tt>'statement'</tt> in the above example. +</blockquote> + +<p> +<b><tt>lr.prod</tt></b> +<blockquote> +A tuple of symbols representing the right-hand side of the production, including the +special <tt>'.'</tt> character. For example, <tt>('ID','.','=','expr')</tt>. +</blockquote> + +<p> +<b><tt>lr.number</tt></b> +<blockquote> +An integer representing the production number in the grammar. +</blockquote> + +<p> +<b><tt>lr.usyms</tt></b> +<blockquote> +A set of unique symbols in the production. Inherited from the original <tt>Production</tt> instance. +</blockquote> + +<p> +<b><tt>lr.lr_index</tt></b> +<blockquote> +An integer representing the position of the dot (.). You should never use <tt>lr.prod.index()</tt> +to search for it--the result will be wrong if the grammar happens to also use (.) as a character +literal. +</blockquote> + +<p> +<b><tt>lr.lr_after</tt></b> +<blockquote> +A list of all productions that can legally appear immediately to the right of the +dot (.). This list contains <tt>Production</tt> instances. This attribute +represents all of the possible branches a parse can take from the current position. +For example, suppose that <tt>lr</tt> represents a stage immediately before +an expression like this: + +<pre> +>>> <b>lr</b> +LRItem(statement -> ID = . 
expr) +>>> +</pre> + +Then, the value of <tt>lr.lr_after</tt> might look like this, showing all productions that +can legally appear next: + +<pre> +>>> <b>lr.lr_after</b> +[Production(expr -> expr PLUS expr), + Production(expr -> expr MINUS expr), + Production(expr -> expr TIMES expr), + Production(expr -> expr DIVIDE expr), + Production(expr -> MINUS expr), + Production(expr -> LPAREN expr RPAREN), + Production(expr -> NUMBER), + Production(expr -> ID)] +>>> +</pre> + +</blockquote> + +<p> +<b><tt>lr.lr_before</tt></b> +<blockquote> +The grammar symbol that appears immediately before the dot (.) or <tt>None</tt> if +at the beginning of the parse. +</blockquote> + +<p> +<b><tt>lr.lr_next</tt></b> +<blockquote> +A link to the next LR item, representing the next stage of the parse. <tt>None</tt> if <tt>lr</tt> +is the last LR item. +</blockquote> + +<tt>LRItem</tt> instances also support the <tt>__len__()</tt> and <tt>__getitem__()</tt> special methods. +<tt>len(lr)</tt> returns the number of items in <tt>lr.prod</tt> including the dot (.). <tt>lr[n]</tt> +returns <tt>lr.prod[n]</tt>. + +<p> +It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +<H2><a name="internal_nn5"></a>5. LRTable</H2> + + +The <tt>LRTable</tt> class is used to represent LR parsing table data. This +minimally includes the production list, action table, and goto table. + +<p> +<b><tt>LRTable()</tt></b> +<blockquote> +Create an empty LRTable object. This object contains only the information needed to +run an LR parser. +</blockquote> + +An instance <tt>lrtab</tt> of <tt>LRTable</tt> has the following methods: + +<p> +<b><tt>lrtab.read_table(module)</tt></b> +<blockquote> +Populates the LR table with information from the module specified in <tt>module</tt>. 
+<tt>module</tt> is either a module object already loaded with <tt>import</tt> or +the name of a Python module. If it's a string containing a module name, it is +loaded and parsing data is extracted. Returns the signature value that was used +when initially writing the tables. Raises a <tt>VersionError</tt> exception if +the module was created using an incompatible version of PLY. +</blockquote> + +<p> +<b><tt>lrtab.bind_callables(dict)</tt></b> +<blockquote> +This binds all of the function names used in productions to callable objects +found in the dictionary <tt>dict</tt>. During table generation and when reading +LR tables from files, PLY only uses the names of action functions such as <tt>'p_expr'</tt>, +<tt>'p_statement'</tt>, etc. In order to actually run the parser, these names +have to be bound to callable objects. This method is always called prior to +running a parser. +</blockquote> + +After <tt>lrtab</tt> has been populated, the following attributes are defined. + +<p> +<b><tt>lrtab.lr_method</tt></b> +<blockquote> +The LR parsing method used (e.g., <tt>'LALR'</tt>) +</blockquote> + + +<p> +<b><tt>lrtab.lr_productions</tt></b> +<blockquote> +The production list. If the parsing tables have been newly +constructed, this will be a list of <tt>Production</tt> instances. If +the parsing tables have been read from a file, it's a list +of <tt>MiniProduction</tt> instances. This, together +with <tt>lr_action</tt> and <tt>lr_goto</tt> contain all of the +information needed by the LR parsing engine. +</blockquote> + +<p> +<b><tt>lrtab.lr_action</tt></b> +<blockquote> +The LR action dictionary that implements the underlying state machine. +The keys of this dictionary are the LR states. +</blockquote> + +<p> +<b><tt>lrtab.lr_goto</tt></b> +<blockquote> +The LR goto table that contains information about grammar rule reductions. +</blockquote> + + +<H2><a name="internal_nn6"></a>6. 
LRGeneratedTable</H2> + + +The <tt>LRGeneratedTable</tt> class represents constructed LR parsing tables on a +grammar. It is a subclass of <tt>LRTable</tt>. + +<p> +<b><tt>LRGeneratedTable(grammar, method='LALR',log=None)</tt></b> +<blockquote> +Create the LR parsing tables on a grammar. <tt>grammar</tt> is an instance of <tt>Grammar</tt>, +<tt>method</tt> is a string with the parsing method (<tt>'SLR'</tt> or <tt>'LALR'</tt>), and +<tt>log</tt> is a logger object used to write debugging information. The debugging information +written to <tt>log</tt> is the same as what appears in the <tt>parser.out</tt> file created +by yacc. By supplying a custom logger with a different message format, it is possible to get +more information (e.g., the line number in <tt>yacc.py</tt> used for issuing each line of +output in the log). The result is an instance of <tt>LRGeneratedTable</tt>. +</blockquote> + +<p> +An instance <tt>lr</tt> of <tt>LRGeneratedTable</tt> has the following attributes. + +<p> +<b><tt>lr.grammar</tt></b> +<blockquote> +A link to the Grammar object used to construct the parsing tables. +</blockquote> + +<p> +<b><tt>lr.lr_method</tt></b> +<blockquote> +The LR parsing method used (e.g., <tt>'LALR'</tt>) +</blockquote> + + +<p> +<b><tt>lr.lr_productions</tt></b> +<blockquote> +A reference to <tt>grammar.Productions</tt>. This, together with <tt>lr_action</tt> and <tt>lr_goto</tt> +contain all of the information needed by the LR parsing engine. +</blockquote> + +<p> +<b><tt>lr.lr_action</tt></b> +<blockquote> +The LR action dictionary that implements the underlying state machine. The keys of this dictionary are +the LR states. +</blockquote> + +<p> +<b><tt>lr.lr_goto</tt></b> +<blockquote> +The LR goto table that contains information about grammar rule reductions. +</blockquote> + +<p> +<b><tt>lr.sr_conflicts</tt></b> +<blockquote> +A list of tuples <tt>(state,token,resolution)</tt> identifying all shift/reduce conflicts. 
<tt>state</tt> is the LR state +number where the conflict occurred, <tt>token</tt> is the token causing the conflict, and <tt>resolution</tt> is +a string describing the resolution taken. <tt>resolution</tt> is either <tt>'shift'</tt> or <tt>'reduce'</tt>. +</blockquote> + +<p> +<b><tt>lr.rr_conflicts</tt></b> +<blockquote> +A list of tuples <tt>(state,rule,rejected)</tt> identifying all reduce/reduce conflicts. <tt>state</tt> is the +LR state number where the conflict occurred, <tt>rule</tt> is the production rule that was selected +and <tt>rejected</tt> is the production rule that was rejected. Both <tt>rule</tt> and <tt>rejected</tt> are +instances of <tt>Production</tt>. They can be inspected to provide the user with more information. +</blockquote> + +<p> +There are two public methods of <tt>LRGeneratedTable</tt>. + +<p> +<b><tt>lr.write_table(modulename,outputdir="",signature="")</tt></b> +<blockquote> +Writes the LR parsing table information to a Python module. <tt>modulename</tt> is a string +specifying the name of a module such as <tt>"parsetab"</tt>. <tt>outputdir</tt> is the name of a +directory where the module should be created. <tt>signature</tt> is a string representing a +grammar signature that's written into the output file. This can be used to detect when +the data stored in a module file is out-of-sync with the grammar specification (and that +the tables need to be regenerated). If <tt>modulename</tt> is a string <tt>"parsetab"</tt>, +this function creates a file called <tt>parsetab.py</tt>. If the module name represents a +package such as <tt>"foo.bar.parsetab"</tt>, then only the last component, <tt>"parsetab"</tt> is +used. +</blockquote> + + +<H2><a name="internal_nn7"></a>7. LRParser</H2> + + +The <tt>LRParser</tt> class implements the low-level LR parsing engine. + + +<p> +<b><tt>LRParser(lrtab, error_func)</tt></b> +<blockquote> +Create an LRParser. 
<tt>lrtab</tt> is an instance of <tt>LRTable</tt> +containing the LR production and state tables. <tt>error_func</tt> is the +error function to invoke in the event of a parsing error. +</blockquote> + +An instance <tt>p</tt> of <tt>LRParser</tt> has the following methods: + +<p> +<b><tt>p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None)</tt></b> +<blockquote> +Run the parser. <tt>input</tt> is a string, which if supplied is fed into the +lexer using its <tt>input()</tt> method. <tt>lexer</tt> is an instance of the +<tt>Lexer</tt> class to use for tokenizing. If not supplied, the last lexer +created with the <tt>lex</tt> module is used. <tt>debug</tt> is a boolean flag +that enables debugging. <tt>tracking</tt> is a boolean flag that tells the +parser to perform additional line number tracking. <tt>tokenfunc</tt> is a callable +function that returns the next token. If supplied, the parser will use it to get +all tokens. +</blockquote> + +<p> +<b><tt>p.restart()</tt></b> +<blockquote> +Resets the parser state for a parse already in progress. +</blockquote> + +<H2><a name="internal_nn8"></a>8. ParserReflect</H2> + + +<p> +The <tt>ParserReflect</tt> class is used to collect parser specification data +from a Python module or object. This class is what collects all of the +<tt>p_rule()</tt> functions in a PLY file, performs basic error checking, +and collects all of the needed information to build a grammar. Most of the +high-level PLY interface as used by the <tt>yacc()</tt> function is actually +implemented by this class. + +<p> +<b><tt>ParserReflect(pdict, log=None)</tt></b> +<blockquote> +Creates a <tt>ParserReflect</tt> instance. <tt>pdict</tt> is a dictionary +containing parser specification data. This dictionary typically corresponds +to the module or class dictionary of code that implements a PLY parser. +<tt>log</tt> is a logger instance that will be used to report error +messages. 
+</blockquote> + +An instance <tt>p</tt> of <tt>ParserReflect</tt> has the following methods: + +<p> +<b><tt>p.get_all()</tt></b> +<blockquote> +Collect and store all required parsing information. +</blockquote> + +<p> +<b><tt>p.validate_all()</tt></b> +<blockquote> +Validate all of the collected parsing information. This is a separate step +from <tt>p.get_all()</tt> as a performance optimization. In order to +reduce parser start-up time, a parser can elect to only validate the +parsing data when regenerating the parsing tables. The validation +step tries to collect as much information as possible rather than +raising an exception at the first sign of trouble. The attribute +<tt>p.error</tt> is set if there are any validation errors. The +value of this attribute is also returned. +</blockquote> + +<p> +<b><tt>p.signature()</tt></b> +<blockquote> +Compute a signature representing the contents of the collected parsing +data. The signature value should change if anything in the parser +specification has changed in a way that would justify parser table +regeneration. This method can be called after <tt>p.get_all()</tt>, +but before <tt>p.validate_all()</tt>. +</blockquote> + +The following attributes are set in the process of collecting data: + +<p> +<b><tt>p.start</tt></b> +<blockquote> +The grammar start symbol, if any. Taken from <tt>pdict['start']</tt>. +</blockquote> + +<p> +<b><tt>p.error_func</tt></b> +<blockquote> +The error handling function or <tt>None</tt>. Taken from <tt>pdict['p_error']</tt>. +</blockquote> + +<p> +<b><tt>p.tokens</tt></b> +<blockquote> +The token list. Taken from <tt>pdict['tokens']</tt>. +</blockquote> + +<p> +<b><tt>p.prec</tt></b> +<blockquote> +The precedence specifier. Taken from <tt>pdict['precedence']</tt>. +</blockquote> + +<p> +<b><tt>p.preclist</tt></b> +<blockquote> +A parsed version of the precedence specified. 
A list of tuples of the form +<tt>(token,assoc,level)</tt> where <tt>token</tt> is the terminal symbol, +<tt>assoc</tt> is the associativity (e.g., <tt>'left'</tt>) and <tt>level</tt> +is a numeric precedence level. +</blockquote> + +<p> +<b><tt>p.grammar</tt></b> +<blockquote> +A list of tuples <tt>(name, rules)</tt> representing the grammar rules. <tt>name</tt> is the +name of a Python function or method in <tt>pdict</tt> that starts with <tt>"p_"</tt>. +<tt>rules</tt> is a list of tuples <tt>(filename,line,prodname,syms)</tt> representing +the grammar rules found in the documentation string of that function. <tt>filename</tt> and <tt>line</tt> contain location +information that can be used for debugging. <tt>prodname</tt> is the name of the +production. <tt>syms</tt> is the right-hand side of the production. If you have a +function like this + +<pre> +def p_expr(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr''' +</pre> + +then the corresponding entry in <tt>p.grammar</tt> might look like this: + +<pre> +('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), + ('calc.py',11,'expr', ['expr','MINUS','expr']), + ('calc.py',12,'expr', ['expr','TIMES','expr']), + ('calc.py',13,'expr', ['expr','DIVIDE','expr']) + ]) +</pre> +</blockquote> + +<p> +<b><tt>p.pfuncs</tt></b> +<blockquote> +A sorted list of tuples <tt>(line, file, name, doc)</tt> representing all of +the <tt>p_</tt> functions found. <tt>line</tt> and <tt>file</tt> give location +information. <tt>name</tt> is the name of the function. <tt>doc</tt> is the +documentation string. This list is sorted in ascending order by line number. +</blockquote> + +<p> +<b><tt>p.files</tt></b> +<blockquote> +A dictionary holding all of the source filenames that were encountered +while collecting parser information. Only the keys of this dictionary have +any meaning. 
+</blockquote> + +<p> +<b><tt>p.error</tt></b> +<blockquote> +An attribute that indicates whether or not any critical errors +occurred in validation. If this is set, it means that some kind +of problem was detected and that no further processing should be +performed. +</blockquote> + + +<H2><a name="internal_nn9"></a>9. High-level operation</H2> + + +Using all of the above classes requires some attention to detail. The <tt>yacc()</tt> +function carries out a very specific sequence of operations to create a grammar. +This same sequence should be emulated if you build an alternative PLY interface. + +<ol> +<li>A <tt>ParserReflect</tt> object is created and raw grammar specification data is +collected. +<li>A <tt>Grammar</tt> object is created and populated with information +from the specification data. +<li>A <tt>LRGeneratedTable</tt> object is created to run the LALR algorithm over +the <tt>Grammar</tt> object. +<li>Productions in the <tt>LRGeneratedTable</tt> are bound to callables using the <tt>bind_callables()</tt> +method. +<li>A <tt>LRParser</tt> object is created from the information in the +<tt>LRGeneratedTable</tt> object. 
+</ol> + +</body> +</html> + + + + + + + diff --git a/ext/ply/doc/makedoc.py b/ext/ply/doc/makedoc.py index 3eed9bd74..415a53aa0 100644 --- a/ext/ply/doc/makedoc.py +++ b/ext/ply/doc/makedoc.py @@ -93,7 +93,7 @@ for s in lines: result.append("") result.append("") skipspace = 0 - + m = h2.match(s) if m: prevheadingtext = m.group(2) @@ -115,7 +115,7 @@ for s in lines: subsection = 0 subsubsection = 0 subsubsubsection = 0 - skipspace = 1 + skipspace = 1 continue m = h3.match(s) if m: @@ -134,7 +134,7 @@ for s in lines: index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) subsubsection = 0 - skipspace = 1 + skipspace = 1 continue m = h4.match(s) if m: @@ -151,7 +151,7 @@ for s in lines: index += "<ul>\n" index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) - skipspace = 1 + skipspace = 1 continue m = h5.match(s) if m: @@ -167,7 +167,7 @@ for s in lines: index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) skipspace = 1 continue - + result.append(s) if subsubsubsection: diff --git a/ext/ply/doc/ply.html b/ext/ply/doc/ply.html index dba0c6288..3345e7929 100644 --- a/ext/ply/doc/ply.html +++ b/ext/ply/doc/ply.html @@ -12,12 +12,13 @@ dave@dabeaz.com<br> </b> <p> -<b>PLY Version: 2.3</b> +<b>PLY Version: 3.0</b> <p> <!-- INDEX --> <div class="sectiontoc"> <ul> +<li><a href="#ply_nn1">Preface and Requirements</a> <li><a href="#ply_nn1">Introduction</a> <li><a href="#ply_nn2">PLY Overview</a> <li><a href="#ply_nn3">Lex</a> @@ -37,13 +38,13 @@ dave@dabeaz.com<br> <li><a href="#ply_nn16">Debugging</a> <li><a href="#ply_nn17">Alternative specification of lexers</a> <li><a href="#ply_nn18">Maintaining state</a> -<li><a href="#ply_nn19">Duplicating lexers</a> +<li><a href="#ply_nn19">Lexer cloning</a> <li><a href="#ply_nn20">Internal lexer state</a> <li><a href="#ply_nn21">Conditional lexing and start conditions</a> <li><a href="#ply_nn21">Miscellaneous Issues</a> </ul> <li><a href="#ply_nn22">Parsing basics</a> 
-<li><a href="#ply_nn23">Yacc reference</a> +<li><a href="#ply_nn23">Yacc</a> <ul> <li><a href="#ply_nn24">An example</a> <li><a href="#ply_nn25">Combining Grammar Rule Functions</a> @@ -56,15 +57,21 @@ dave@dabeaz.com<br> <ul> <li><a href="#ply_nn30">Recovery and resynchronization with error rules</a> <li><a href="#ply_nn31">Panic mode recovery</a> +<li><a href="#ply_nn35">Signaling an error from a production</a> <li><a href="#ply_nn32">General comments on error handling</a> </ul> <li><a href="#ply_nn33">Line Number and Position Tracking</a> <li><a href="#ply_nn34">AST Construction</a> <li><a href="#ply_nn35">Embedded Actions</a> -<li><a href="#ply_nn36">Yacc implementation notes</a> +<li><a href="#ply_nn36">Miscellaneous Yacc Notes</a> </ul> -<li><a href="#ply_nn37">Parser and Lexer State Management</a> +<li><a href="#ply_nn37">Multiple Parsers and Lexers</a> <li><a href="#ply_nn38">Using Python's Optimized Mode</a> +<li><a href="#ply_nn44">Advanced Debugging</a> +<ul> +<li><a href="#ply_nn45">Debugging the lex() and yacc() commands</a> +<li><a href="#ply_nn46">Run-time Debugging</a> +</ul> <li><a href="#ply_nn39">Where to go from here?</a> </ul> </div> @@ -72,10 +79,26 @@ dave@dabeaz.com<br> +<H2><a name="ply_nn1"></a>1. Preface and Requirements</H2> +<p> +This document provides an overview of lexing and parsing with PLY. +Given the intrinsic complexity of parsing, I would strongly advise +that you read (or at least skim) this entire document before jumping +into a big development project with PLY. +</p> -<H2><a name="ply_nn1"></a>1. Introduction</H2> +<p> +PLY-3.0 is compatible with both Python 2 and Python 3. Be aware that +Python 3 support is new and has not been extensively tested (although +all of the examples and unit tests pass under Python 3.0). If you are +using Python 2, you should try to use Python 2.4 or newer. Although PLY +works with versions as far back as Python 2.2, some of its optional features +require more modern library modules. 
+</p> + +<H2><a name="ply_nn1"></a>2. Introduction</H2> PLY is a pure-Python implementation of the popular compiler @@ -95,7 +118,10 @@ include lexical analysis, parsing, type checking, type inference, nested scoping, and code generation for the SPARC processor. Approximately 30 different compiler implementations were completed in this course. Most of PLY's interface and operation has been influenced by common -usability problems encountered by students. +usability problems encountered by students. Since 2001, PLY has +continued to be improved as feedback has been received from users. +PLY-3.0 represents a major refactoring of the original implementation +with an eye towards future enhancements. <p> Since PLY was primarily developed as an instructional tool, you will @@ -120,7 +146,7 @@ Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex and Yacc" by John Levine may also be handy. In fact, the O'Reilly book can be used as a reference for PLY as the concepts are virtually identical. -<H2><a name="ply_nn2"></a>2. PLY Overview</H2> +<H2><a name="ply_nn2"></a>3. PLY Overview</H2> PLY consists of two separate modules; <tt>lex.py</tt> and @@ -163,7 +189,7 @@ parsing tables is relatively expensive, PLY caches the results and saves them to a file. If no changes are detected in the input source, the tables are read from the cache. Otherwise, they are regenerated. -<H2><a name="ply_nn3"></a>3. Lex</H2> +<H2><a name="ply_nn3"></a>4. Lex</H2> <tt>lex.py</tt> is used to tokenize an input string. For example, suppose @@ -206,7 +232,7 @@ More specifically, the input is broken into pairs of token types and values. Fo The identification of tokens is typically done by writing a series of regular expression rules. The next section shows how this is done using <tt>lex.py</tt>. -<H3><a name="ply_nn4"></a>3.1 Lex Example</H3> +<H3><a name="ply_nn4"></a>4.1 Lex Example</H3> The following example shows how <tt>lex.py</tt> is used to write a simple tokenizer. 
@@ -243,11 +269,7 @@ t_RPAREN = r'\)' # A regular expression rule with some action code def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t # Define a rule so we can track line numbers @@ -264,11 +286,14 @@ def t_error(t): t.lexer.skip(1) # Build the lexer -lex.lex() +lexer = lex.lex() </pre> </blockquote> -To use the lexer, you first need to feed it some input text using its <tt>input()</tt> method. After that, repeated calls to <tt>token()</tt> produce tokens. The following code shows how this works: +To use the lexer, you first need to feed it some input text using +its <tt>input()</tt> method. After that, repeated calls +to <tt>token()</tt> produce tokens. The following code shows how this +works: <blockquote> <pre> @@ -280,11 +305,11 @@ data = ''' ''' # Give the lexer some input -lex.input(data) +lexer.input(data) # Tokenize -while 1: - tok = lex.token() +while True: + tok = lexer.token() if not tok: break # No more input print tok </pre> @@ -308,7 +333,16 @@ LexToken(NUMBER,2,3,21) </pre> </blockquote> -The tokens returned by <tt>lex.token()</tt> are instances +Lexers also support the iteration protocol. So, you can write the above loop as follows: + +<blockquote> +<pre> +for tok in lexer: + print tok +</pre> +</blockquote> + +The tokens returned by <tt>lexer.token()</tt> are instances of <tt>LexToken</tt>. This object has attributes <tt>tok.type</tt>, <tt>tok.value</tt>, <tt>tok.lineno</tt>, and <tt>tok.lexpos</tt>. The following code shows an example of @@ -317,8 +351,8 @@ accessing these attributes: <blockquote> <pre> # Tokenize -while 1: - tok = lex.token() +while True: + tok = lexer.token() if not tok: break # No more input print tok.type, tok.value, tok.line, tok.lexpos </pre> @@ -330,7 +364,7 @@ type and value of the token itself. the location of the token. 
<tt>tok.lexpos</tt> is the index of the token relative to the start of the input text. -<H3><a name="ply_nn5"></a>3.2 The tokens list</H3> +<H3><a name="ply_nn5"></a>4.2 The tokens list</H3> All lexers must provide a list <tt>tokens</tt> that defines all of the possible token @@ -355,7 +389,7 @@ tokens = ( </pre> </blockquote> -<H3><a name="ply_nn6"></a>3.3 Specification of tokens</H3> +<H3><a name="ply_nn6"></a>4.3 Specification of tokens</H3> Each token is specified by writing a regular expression rule. Each of these rules are @@ -379,11 +413,7 @@ converts the string into a Python integer. <pre> def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Number %s is too large!" % t.value - t.value = 0 + t.value = int(t.value) return t </pre> </blockquote> @@ -414,8 +444,8 @@ expressions in order of decreasing length, this problem is solved for rules defi the order can be explicitly controlled since rules appearing first are checked first. <p> -To handle reserved words, it is usually easier to just match an identifier and do a special name lookup in a function -like this: +To handle reserved words, you should write a single rule to match an +identifier and do a special name lookup in a function like this: <blockquote> <pre> @@ -427,6 +457,8 @@ reserved = { ... } +tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) + def t_ID(t): r'[a-zA-Z_][a-zA-Z_0-9]*' t.type = reserved.get(t.value,'ID') # Check for reserved words @@ -449,7 +481,7 @@ t_PRINT = r'print' those rules will be triggered for identifiers that include those words as a prefix such as "forget" or "printed". This is probably not what you want. -<H3><a name="ply_nn7"></a>3.4 Token values</H3> +<H3><a name="ply_nn7"></a>4.4 Token values</H3> When tokens are returned by lex, they have a value that is stored in the <tt>value</tt> attribute. 
Normally, the value is the text @@ -468,9 +500,10 @@ def t_ID(t): </blockquote> It is important to note that storing data in other attribute names is <em>not</em> recommended. The <tt>yacc.py</tt> module only exposes the -contents of the <tt>value</tt> attribute. Thus, accessing other attributes may be unnecessarily awkward. +contents of the <tt>value</tt> attribute. Thus, accessing other attributes may be unnecessarily awkward. If you +need to store multiple values on a token, assign a tuple, dictionary, or instance to <tt>value</tt>. -<H3><a name="ply_nn8"></a>3.5 Discarded tokens</H3> +<H3><a name="ply_nn8"></a>4.5 Discarded tokens</H3> To discard a token, such as a comment, simply define a token rule that returns no value. For example: @@ -496,7 +529,7 @@ Be advised that if you are ignoring many different kinds of text, you may still control over the order in which regular expressions are matched (i.e., functions are matched in order of specification whereas strings are sorted by regular expression length). -<H3><a name="ply_nn9"></a>3.6 Line numbers and positional information</H3> +<H3><a name="ply_nn9"></a>4.6 Line numbers and positional information</H3> <p>By default, <tt>lex.py</tt> knows nothing about line numbers. This is because <tt>lex.py</tt> doesn't know anything @@ -525,11 +558,10 @@ column information as a separate step. For instance, just count backwards unti # input is the input text string # token is a token instance def find_column(input,token): - i = token.lexpos - while i > 0: - if input[i] == '\n': break - i -= 1 - column = (token.lexpos - i)+1 + last_cr = input.rfind('\n',0,token.lexpos) + if last_cr < 0: + last_cr = 0 + column = (token.lexpos - last_cr) + 1 return column </pre> </blockquote> @@ -537,7 +569,7 @@ def find_column(input,token): Since column information is often only useful in the context of error handling, calculating the column position can be performed when needed as opposed to doing it for each token. 
-<H3><a name="ply_nn10"></a>3.7 Ignored characters</H3> +<H3><a name="ply_nn10"></a>4.7 Ignored characters</H3> <p> @@ -549,7 +581,7 @@ similar to <tt>t_newline()</tt>, the use of <tt>t_ignore</tt> provides substanti lexing performance because it is handled as a special case and is checked in a much more efficient manner than the normal regular expression rules. -<H3><a name="ply_nn11"></a>3.8 Literal characters</H3> +<H3><a name="ply_nn11"></a>4.8 Literal characters</H3> <p> @@ -575,7 +607,7 @@ take precedence. <p> When a literal token is returned, both its <tt>type</tt> and <tt>value</tt> attributes are set to the character itself. For example, <tt>'+'</tt>. -<H3><a name="ply_nn12"></a>3.9 Error handling</H3> +<H3><a name="ply_nn12"></a>4.9 Error handling</H3> <p> @@ -596,44 +628,42 @@ def t_error(t): In this case, we simply print the offending character and skip ahead one character by calling <tt>t.lexer.skip(1)</tt>. -<H3><a name="ply_nn13"></a>3.10 Building and using the lexer</H3> +<H3><a name="ply_nn13"></a>4.10 Building and using the lexer</H3> <p> To build the lexer, the function <tt>lex.lex()</tt> is used. This function uses Python reflection (or introspection) to read the the regular expression rules -out of the calling context and build the lexer. Once the lexer has been built, two functions can +out of the calling context and build the lexer. Once the lexer has been built, two methods can be used to control the lexer. <ul> -<li><tt>lex.input(data)</tt>. Reset the lexer and store a new input string. -<li><tt>lex.token()</tt>. Return the next token. Returns a special <tt>LexToken</tt> instance on success or +<li><tt>lexer.input(data)</tt>. Reset the lexer and store a new input string. +<li><tt>lexer.token()</tt>. Return the next token. Returns a special <tt>LexToken</tt> instance on success or None if the end of the input text has been reached. </ul> -If desired, the lexer can also be used as an object. 
The <tt>lex()</tt> returns a <tt>Lexer</tt> object that -can be used for this purpose. For example: +The preferred way to use PLY is to invoke the above methods directly on the lexer object returned by the +<tt>lex()</tt> function. The legacy interface to PLY involves module-level functions <tt>lex.input()</tt> and <tt>lex.token()</tt>. +For example: <blockquote> <pre> -lexer = lex.lex() -lexer.input(sometext) +lex.lex() +lex.input(sometext) while 1: - tok = lexer.token() + tok = lex.token() if not tok: break print tok </pre> </blockquote> <p> -This latter technique should be used if you intend to use multiple lexers in your application. Simply define each -lexer in its own module and use the object returned by <tt>lex()</tt> as appropriate. +In this example, the module-level functions <tt>lex.input()</tt> and <tt>lex.token()</tt> are bound to the <tt>input()</tt> +and <tt>token()</tt> methods of the last lexer created by the lex module. This interface may go away at some point so +it's probably best not to use it. -<p> -Note: The global functions <tt>lex.input()</tt> and <tt>lex.token()</tt> are bound to the <tt>input()</tt> -and <tt>token()</tt> methods of the last lexer created by the lex module. - -<H3><a name="ply_nn14"></a>3.11 The @TOKEN decorator</H3> +<H3><a name="ply_nn14"></a>4.11 The @TOKEN decorator</H3> In some applications, you may want to define build tokens from as a series of @@ -680,7 +710,7 @@ t_ID.__doc__ = identifier <b>NOTE:</b> Use of <tt>@TOKEN</tt> requires Python-2.4 or newer. If you're concerned about backwards compatibility with older versions of Python, use the alternative approach of setting the docstring directly. 
-<H3><a name="ply_nn15"></a>3.12 Optimized mode</H3> +<H3><a name="ply_nn15"></a>4.12 Optimized mode</H3> For improved performance, it may be desirable to use Python's @@ -717,7 +747,7 @@ lexer = lex.lex(optimize=1,lextab="footab") When running in optimized mode, it is important to note that lex disables most error checking. Thus, this is really only recommended if you're sure everything is working correctly and you're ready to start releasing production code. -<H3><a name="ply_nn16"></a>3.13 Debugging</H3> +<H3><a name="ply_nn16"></a>4.13 Debugging</H3> For the purpose of debugging, you can run <tt>lex()</tt> in a debugging mode as follows: @@ -728,12 +758,16 @@ lexer = lex.lex(debug=1) </pre> </blockquote> -This will result in a large amount of debugging information to be printed including all of the added rules and the master -regular expressions. +<p> +This will produce various sorts of debugging information including all of the added rules, +the master regular expressions used by the lexer, and tokens generated during lexing. +</p> +<p> In addition, <tt>lex.py</tt> comes with a simple main function which will either tokenize input read from standard input or from a file specified on the command line. To use it, simply put this in your lexer: +</p> <blockquote> <pre> @@ -742,7 +776,10 @@ if __name__ == '__main__': </pre> </blockquote> -<H3><a name="ply_nn17"></a>3.14 Alternative specification of lexers</H3> +Please refer to the "Debugging" section near the end for some more advanced details +of debugging. + +<H3><a name="ply_nn17"></a>4.14 Alternative specification of lexers</H3> As shown in the example, lexers are specified all within one Python module. If you want to @@ -780,11 +817,7 @@ t_RPAREN = r'\)' # A regular expression rule with some action code def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" 
% (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t # Define a rule so we can track line numbers @@ -821,7 +854,7 @@ None </pre> </blockquote> -The <tt>object</tt> option can be used to define lexers as a class instead of a module. For example: +The <tt>module</tt> option can also be used to define lexers from instances of a class. For example: <blockquote> <pre> @@ -851,11 +884,7 @@ class MyLexer: # Note addition of self parameter since we're in a class def t_NUMBER(self,t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t # Define a rule so we can track line numbers @@ -873,12 +902,12 @@ class MyLexer: <b># Build the lexer def build(self,**kwargs): - self.lexer = lex.lex(object=self, **kwargs)</b> + self.lexer = lex.lex(module=self, **kwargs)</b> # Test it output def test(self,data): self.lexer.input(data) - while 1: + while True: tok = lexer.token() if not tok: break print tok @@ -890,14 +919,81 @@ m.test("3 + 4") # Test it </pre> </blockquote> -For reasons that are subtle, you should <em>NOT</em> invoke <tt>lex.lex()</tt> inside the <tt>__init__()</tt> method of your class. If you -do, it may cause bizarre behavior if someone tries to duplicate a lexer object. Keep reading. -<H3><a name="ply_nn18"></a>3.15 Maintaining state</H3> +When building a lexer from class, <em>you should construct the lexer from +an instance of the class</em>, not the class object itself. This is because +PLY only works properly if the lexer actions are defined by bound-methods. + +<p> +When using the <tt>module</tt> option to <tt>lex()</tt>, PLY collects symbols +from the underlying object using the <tt>dir()</tt> function. There is no +direct access to the <tt>__dict__</tt> attribute of the object supplied as a +module value. 
+ +<P> +Finally, if you want to keep things nicely encapsulated, but don't want to use a +full-fledged class definition, lexers can be defined using closures. For example: + +<blockquote> +<pre> +import ply.lex as lex + +# List of token names. This is always required +tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', +) + +def MyLexer(): + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print "Illegal character '%s'" % t.value[0] + t.lexer.skip(1) + + # Build the lexer from my environment and return it + return lex.lex() +</pre> +</blockquote> + + +<H3><a name="ply_nn18"></a>4.15 Maintaining state</H3> -In your lexer, you may want to maintain a variety of state information. This might include mode settings, symbol tables, and other details. There are a few -different ways to handle this situation. First, you could just keep some global variables: +In your lexer, you may want to maintain a variety of state +information. This might include mode settings, symbol tables, and +other details. As an example, suppose that you wanted to keep +track of how many NUMBER tokens had been encountered. + +<p> +One way to do this is to keep a set of global variables in the module +where you created the lexer. For example: <blockquote> <pre> @@ -906,28 +1002,22 @@ def t_NUMBER(t): r'\d+' global num_count num_count += 1 - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" 
% (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t </pre> </blockquote> -Alternatively, you can store this information inside the Lexer object created by <tt>lex()</tt>. To this, you can use the <tt>lexer</tt> attribute -of tokens passed to the various rules. For example: +If you don't like the use of a global variable, another place to store +information is inside the Lexer object created by <tt>lex()</tt>. +To do this, you can use the <tt>lexer</tt> attribute of tokens passed to +the various rules. For example: <blockquote> <pre> def t_NUMBER(t): r'\d+' t.lexer.num_count += 1 # Note use of lexer attribute - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t lexer = lex.lex() @@ -935,17 +1025,20 @@ lexer.num_count = 0 # Set the initial count </pre> </blockquote> -This latter approach has the advantage of storing information inside -the lexer itself---something that may be useful if multiple instances -of the same lexer have been created. However, it may also feel kind -of "hacky" to the purists. Just to put their mind at some ease, all +This latter approach has the advantage of being simple and working +correctly in applications where multiple instantiations of a given +lexer exist in the same application. However, this might also feel +like a gross violation of encapsulation to OO purists. +Just to put your mind at some ease, all internal attributes of the lexer (with the exception of <tt>lineno</tt>) have names that are prefixed by <tt>lex</tt> (e.g., <tt>lexdata</tt>,<tt>lexpos</tt>, etc.). Thus, -it should be perfectly safe to store attributes in the lexer that -don't have names starting with that prefix. +it is perfectly safe to store attributes in the lexer that +don't have names starting with that prefix or a name that conflicts with one of the +predefined methods (e.g., <tt>input()</tt>, <tt>token()</tt>, etc.). 
<p> -A third approach is to define the lexer as a class as shown in the previous example: +If you don't like assigning values on the lexer object, you can define your lexer as a class as +shown in the previous section: <blockquote> <pre> @@ -954,11 +1047,7 @@ class MyLexer: def t_NUMBER(self,t): r'\d+' self.num_count += 1 - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t def build(self, **kwargs): @@ -966,23 +1055,36 @@ class MyLexer: def __init__(self): self.num_count = 0 - -# Create a lexer -m = MyLexer() -lexer = lex.lex(object=m) </pre> </blockquote> -The class approach may be the easiest to manage if your application is going to be creating multiple instances of the same lexer and -you need to manage a lot of state. +The class approach may be the easiest to manage if your application is +going to be creating multiple instances of the same lexer and you need +to manage a lot of state. -<H3><a name="ply_nn19"></a>3.16 Duplicating lexers</H3> +<p> +State can also be managed through closures. For example, in Python 3: +<blockquote> +<pre> +def MyLexer(): + num_count = 0 + ... + def t_NUMBER(t): + r'\d+' + nonlocal num_count + num_count += 1 + t.value = int(t.value) + return t + ... +</pre> +</blockquote> + +<H3><a name="ply_nn19"></a>4.16 Lexer cloning</H3> -<b>NOTE: I am thinking about deprecating this feature. Post comments on <a href="http://groups.google.com/group/ply-hack">ply-hack@googlegroups.com</a> or send me a private email at dave@dabeaz.com.</b> <p> -If necessary, a lexer object can be quickly duplicated by invoking its <tt>clone()</tt> method. For example: +If necessary, a lexer object can be duplicated by invoking its <tt>clone()</tt> method. For example: <blockquote> <pre> @@ -992,23 +1094,25 @@ newlexer = lexer.clone() </pre> </blockquote> -When a lexer is cloned, the copy is identical to the original lexer, -including any input text. 
However, once created, different text can be -fed to the clone which can be used independently. This capability may -be useful in situations when you are writing a parser/compiler that +When a lexer is cloned, the copy is exactly identical to the original lexer +including any input text and internal state. However, the clone allows a +different set of input text to be supplied which may be processed separately. +This may be useful in situations when you are writing a parser/compiler that involves recursive or reentrant processing. For instance, if you needed to scan ahead in the input for some reason, you could create a -clone and use it to look ahead. +clone and use it to look ahead. Or, if you were implementing some kind of preprocessor, +cloned lexers could be used to handle different input files. <p> -The advantage of using <tt>clone()</tt> instead of reinvoking <tt>lex()</tt> is -that it is significantly faster. Namely, it is not necessary to re-examine all of the -token rules, build a regular expression, and construct internal tables. All of this -information can simply be reused in the new lexer. +Creating a clone is different than calling <tt>lex.lex()</tt> in that +PLY doesn't regenerate any of the internal tables or regular expressions, so cloning is significantly faster. <p> -Special considerations need to be made when cloning a lexer that is defined as a class. Previous sections -showed an example of a class <tt>MyLexer</tt>. If you have the following code: +Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this: <blockquote> <pre> @@ -1020,43 +1124,12 @@ b = a.clone() # Clone the lexer </blockquote> Then both <tt>a</tt> and <tt>b</tt> are going to be bound to the same -object <tt>m</tt>. 
If the object <tt>m</tt> contains internal state -related to lexing, this sharing may lead to quite a bit of confusion. To fix this, -the <tt>clone()</tt> method accepts an optional argument that can be used to supply a new object. This -can be used to clone the lexer and bind it to a new instance. For example: +object <tt>m</tt> and any changes to <tt>m</tt> will be reflected in both lexers. It's +important to emphasize that <tt>clone()</tt> is only meant to create a new lexer +that reuses the regular expressions and environment of another lexer. If you +need to make a totally new copy of a lexer, then call <tt>lex()</tt> again. -<blockquote> -<pre> -m = MyLexer() # Create a lexer -a = lex.lex(object=m) - -# Create a clone -n = MyLexer() # New instance of MyLexer -b = a.clone(n) # New lexer bound to n -</pre> -</blockquote> - -It may make sense to encapsulate all of this inside a method: - -<blockquote> -<pre> -class MyLexer: - ... - def clone(self): - c = MyLexer() # Create a new instance of myself - # Copy attributes from self to c as appropriate - ... - # Clone the lexer - c.lexer = self.lexer.clone(c) - return c -</pre> -</blockquote> - -The fact that a new instance of <tt>MyLexer</tt> may be created while cloning a lexer is the reason why you should never -invoke <tt>lex.lex()</tt> inside <tt>__init__()</tt>. If you do, the lexer will be rebuilt from scratch and you lose -all of the performance benefits of using <tt>clone()</tt> in the first place. - -<H3><a name="ply_nn20"></a>3.17 Internal lexer state</H3> +<H3><a name="ply_nn20"></a>4.17 Internal lexer state</H3> A Lexer object <tt>lexer</tt> has a number of internal attributes that may be useful in certain @@ -1074,8 +1147,9 @@ matched at the new position. <p> <tt>lexer.lineno</tt> <blockquote> -The current value of the line number attribute stored in the lexer. This can be modified as needed to -change the line number. +The current value of the line number attribute stored in the lexer. 
PLY only specifies that the attribute +exists---it never sets, updates, or performs any processing with it. If you want to track line numbers, +you will need to add code yourself (see the section on line numbers and positional information). </blockquote> <p> @@ -1090,9 +1164,10 @@ would probably be a bad idea to modify this unless you really know what you're d <blockquote> This is the raw <tt>Match</tt> object returned by the Python <tt>re.match()</tt> function (used internally by PLY) for the current token. If you have written a regular expression that contains named groups, you can use this to retrieve those values. +Note: This attribute is only updated when tokens are defined and processed by functions. </blockquote> -<H3><a name="ply_nn21"></a>3.18 Conditional lexing and start conditions</H3> +<H3><a name="ply_nn21"></a>4.18 Conditional lexing and start conditions</H3> In advanced parsing applications, it may be useful to have different @@ -1291,7 +1366,7 @@ However, if the closing right brace is encountered, the rule <tt>t_ccode_rbrace< position), stores it, and returns a token 'CCODE' containing all of that text. When returning the token, the lexing state is restored back to its initial state. -<H3><a name="ply_nn21"></a>3.19 Miscellaneous Issues</H3> +<H3><a name="ply_nn21"></a>4.19 Miscellaneous Issues</H3> <P> @@ -1331,7 +1406,7 @@ tokens are available. <li>The <tt>token()</tt> method must return an object <tt>tok</tt> that has <tt>type</tt> and <tt>value</tt> attributes. </ul> -<H2><a name="ply_nn22"></a>4. Parsing basics</H2> +<H2><a name="ply_nn22"></a>5. Parsing basics</H2> <tt>yacc.py</tt> is used to parse language syntax. Before showing an @@ -1357,9 +1432,10 @@ factor : NUMBER </blockquote> In the grammar, symbols such as <tt>NUMBER</tt>, <tt>+</tt>, <tt>-</tt>, <tt>*</tt>, and <tt>/</tt> are known -as <em>terminals</em> and correspond to raw input tokens. 
Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to more -complex rules, typically comprised of a collection of tokens. These identifiers are known as <em>non-terminals</em>. +as <em>terminals</em> and correspond to raw input tokens. Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to +grammar rules comprised of a collection of terminals and other rules. These identifiers are known as <em>non-terminals</em>. <P> + The semantic behavior of a language is often specified using a technique known as syntax directed translation. In syntax directed translation, attributes are attached to each symbol in a given grammar @@ -1385,9 +1461,12 @@ factor : NUMBER factor.val = int(NUMBER.lexval) </pre> </blockquote> -A good way to think about syntax directed translation is to simply think of each symbol in the grammar as some -kind of object. The semantics of the language are then expressed as a collection of methods/operations on these -objects. +A good way to think about syntax directed translation is to +view each symbol in the grammar as a kind of object. Associated +with each symbol is a value representing its "state" (for example, the +<tt>val</tt> attribute above). Semantic +actions are then expressed as a collection of functions or methods +that operate on the symbols and associated values. <p> Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a @@ -1396,62 +1475,78 @@ Whenever a valid right-hand-side is found in the input, the appropriate action c grammar symbols are replaced by the grammar symbol on the left-hand-side. <p> -LR parsing is commonly implemented by shifting grammar symbols onto a stack and looking at the stack and the next -input token for patterns. 
The details of the algorithm can be found in a compiler text, but the -following example illustrates the steps that are performed if you wanted to parse the expression -<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above: +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns that +match one of the grammar rules. +The details of the algorithm can be found in a compiler textbook, but the +following example illustrates the steps that are performed if you +wanted to parse the expression +<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above. In the example, +the special symbol <tt>$</tt> represents the end of input. + <blockquote> <pre> Step Symbol Stack Input Tokens Action ---- --------------------- --------------------- ------------------------------- -1 $ 3 + 5 * ( 10 - 20 )$ Shift 3 -2 $ 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -3 $ factor + 5 * ( 10 - 20 )$ Reduce term : factor -4 $ term + 5 * ( 10 - 20 )$ Reduce expr : term -5 $ expr + 5 * ( 10 - 20 )$ Shift + -6 $ expr + 5 * ( 10 - 20 )$ Shift 5 -7 $ expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -8 $ expr + factor * ( 10 - 20 )$ Reduce term : factor -9 $ expr + term * ( 10 - 20 )$ Shift * -10 $ expr + term * ( 10 - 20 )$ Shift ( -11 $ expr + term * ( 10 - 20 )$ Shift 10 -12 $ expr + term * ( 10 - 20 )$ Reduce factor : NUMBER -13 $ expr + term * ( factor - 20 )$ Reduce term : factor -14 $ expr + term * ( term - 20 )$ Reduce expr : term -15 $ expr + term * ( expr - 20 )$ Shift - -16 $ expr + term * ( expr - 20 )$ Shift 20 -17 $ expr + term * ( expr - 20 )$ Reduce factor : NUMBER -18 $ expr + term * ( expr - factor )$ Reduce term : factor -19 $ expr + term * ( expr - term )$ Reduce expr : expr - term -20 $ expr + term * ( expr )$ Shift ) -21 $ expr + term * ( expr ) $ Reduce factor : (expr) -22 $ expr + term * factor $ Reduce term : term * factor -23 $ expr + term $ Reduce expr : expr + term -24 $ expr $ Reduce 
expr -25 $ $ Success! -</pre> -</blockquote> - -When parsing the expression, an underlying state machine and the current input token determine what to do next. -If the next token looks like part of a valid grammar rule (based on other items on the stack), it is generally shifted -onto the stack. If the top of the stack contains a valid right-hand-side of a grammar rule, it is -usually "reduced" and the symbols replaced with the symbol on the left-hand-side. When this reduction occurs, the -appropriate action is triggered (if defined). If the input token can't be shifted and the top of stack doesn't match -any grammar rules, a syntax error has occurred and the parser must take some kind of recovery step (or bail out). - -<p> -It is important to note that the underlying implementation is built around a large finite-state machine that is encoded -in a collection of tables. The construction of these tables is quite complicated and beyond the scope of this discussion. -However, subtle details of this process explain why, in the example above, the parser chooses to shift a token -onto the stack in step 9 rather than reducing the rule <tt>expr : expr + term</tt>. - -<H2><a name="ply_nn23"></a>5. Yacc reference</H2> - - -This section describes how to use write parsers in PLY. 
- -<H3><a name="ply_nn24"></a>5.1 An example</H3> +1 3 + 5 * ( 10 - 20 )$ Shift 3 +2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER +3 factor + 5 * ( 10 - 20 )$ Reduce term : factor +4 term + 5 * ( 10 - 20 )$ Reduce expr : term +5 expr + 5 * ( 10 - 20 )$ Shift + +6 expr + 5 * ( 10 - 20 )$ Shift 5 +7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER +8 expr + factor * ( 10 - 20 )$ Reduce term : factor +9 expr + term * ( 10 - 20 )$ Shift * +10 expr + term * ( 10 - 20 )$ Shift ( +11 expr + term * ( 10 - 20 )$ Shift 10 +12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER +13 expr + term * ( factor - 20 )$ Reduce term : factor +14 expr + term * ( term - 20 )$ Reduce expr : term +15 expr + term * ( expr - 20 )$ Shift - +16 expr + term * ( expr - 20 )$ Shift 20 +17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER +18 expr + term * ( expr - factor )$ Reduce term : factor +19 expr + term * ( expr - term )$ Reduce expr : expr - term +20 expr + term * ( expr )$ Shift ) +21 expr + term * ( expr ) $ Reduce factor : (expr) +22 expr + term * factor $ Reduce term : term * factor +23 expr + term $ Reduce expr : expr + term +24 expr $ Reduce expr +25 $ Success! +</pre> +</blockquote> + +When parsing the expression, an underlying state machine and the +current input token determine what happens next. If the next token +looks like part of a valid grammar rule (based on other items on the +stack), it is generally shifted onto the stack. If the top of the +stack contains a valid right-hand-side of a grammar rule, it is +usually "reduced" and the symbols replaced with the symbol on the +left-hand-side. When this reduction occurs, the appropriate action is +triggered (if defined). If the input token can't be shifted and the +top of stack doesn't match any grammar rules, a syntax error has +occurred and the parser must take some kind of recovery step (or bail +out). A parse is only successful if the parser reaches a state where +the symbol stack is empty and there are no more input tokens. 
+ +<p> +It is important to note that the underlying implementation is built +around a large finite-state machine that is encoded in a collection of +tables. The construction of these tables is non-trivial and +beyond the scope of this discussion. However, subtle details of this +process explain why, in the example above, the parser chooses to shift +a token onto the stack in step 9 rather than reducing the +rule <tt>expr : expr + term</tt>. + +<H2><a name="ply_nn23"></a>6. Yacc</H2> + + +The <tt>ply.yacc</tt> module implements the parsing component of PLY. +The name "yacc" stands for "Yet Another Compiler Compiler" and is +borrowed from the Unix tool of the same name. + +<H3><a name="ply_nn24"></a>6.1 An example</H3> Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. Here is @@ -1503,26 +1598,26 @@ def p_error(p): print "Syntax error in input!" # Build the parser -yacc.yacc() - -# Use this if you want to build the parser using SLR instead of LALR -# yacc.yacc(method="SLR") +parser = yacc.yacc() -while 1: +while True: try: s = raw_input('calc > ') except EOFError: break if not s: continue - result = yacc.parse(s) + result = parser.parse(s) print result </pre> </blockquote> -In this example, each grammar rule is defined by a Python function where the docstring to that function contains the -appropriate context-free grammar specification. Each function accepts a single -argument <tt>p</tt> that is a sequence containing the values of each grammar symbol in the corresponding rule. The values of -<tt>p[i]</tt> are mapped to grammar symbols as shown here: +In this example, each grammar rule is defined by a Python function +where the docstring to that function contains the appropriate +context-free grammar specification. The statements that make up the +function body implement the semantic actions of the rule. 
Each function +accepts a single argument <tt>p</tt> that is a sequence containing the +values of each grammar symbol in the corresponding rule. The values +of <tt>p[i]</tt> are mapped to grammar symbols as shown here: <blockquote> <pre> @@ -1535,42 +1630,49 @@ def p_expression_plus(p): </pre> </blockquote> -For tokens, the "value" of the corresponding <tt>p[i]</tt> is the -<em>same</em> as the <tt>p.value</tt> attribute assigned -in the lexer module. For non-terminals, the value is determined by -whatever is placed in <tt>p[0]</tt> when rules are reduced. This -value can be anything at all. However, it probably most common for -the value to be a simple Python type, a tuple, or an instance. In this example, we -are relying on the fact that the <tt>NUMBER</tt> token stores an integer value in its value -field. All of the other rules simply perform various types of integer operations and store -the result. - -<P> -Note: The use of negative indices have a special meaning in yacc---specially <tt>p[-1]</tt> does -not have the same value as <tt>p[3]</tt> in this example. Please see the section on "Embedded Actions" for further -details. - <p> -The first rule defined in the yacc specification determines the starting grammar -symbol (in this case, a rule for <tt>expression</tt> appears first). Whenever -the starting rule is reduced by the parser and no more input is available, parsing -stops and the final value is returned (this value will be whatever the top-most rule -placed in <tt>p[0]</tt>). Note: an alternative starting symbol can be specified using the <tt>start</tt> keyword argument to +For tokens, the "value" of the corresponding <tt>p[i]</tt> is the +<em>same</em> as the <tt>p.value</tt> attribute assigned in the lexer +module. For non-terminals, the value is determined by whatever is +placed in <tt>p[0]</tt> when rules are reduced. This value can be +anything at all. 
However, it is probably most common for the value to be +a simple Python type, a tuple, or an instance. In this example, we +are relying on the fact that the <tt>NUMBER</tt> token stores an +integer value in its value field. All of the other rules simply +perform various types of integer operations and propagate the result. +</p> + +<p> +Note: The use of negative indices has a special meaning in +yacc---specifically, <tt>p[-1]</tt> does not have the same value +as <tt>p[3]</tt> in this example. Please see the section on "Embedded +Actions" for further details. +</p> + +<p> +The first rule defined in the yacc specification determines the +starting grammar symbol (in this case, a rule for <tt>expression</tt> +appears first). Whenever the starting rule is reduced by the parser +and no more input is available, parsing stops and the final value is +returned (this value will be whatever the top-most rule placed +in <tt>p[0]</tt>). Note: an alternative starting symbol can be +specified using the <tt>start</tt> keyword argument to <tt>yacc()</tt>. -<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors. See the error handling section -below for more detail. +<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors. +See the error handling section below for more detail. <p> -To build the parser, call the <tt>yacc.yacc()</tt> function. This function -looks at the module and attempts to construct all of the LR parsing tables for the grammar -you have specified. The first time <tt>yacc.yacc()</tt> is invoked, you will get a message -such as this: +To build the parser, call the <tt>yacc.yacc()</tt> function. This +function looks at the module and attempts to construct all of the LR +parsing tables for the grammar you have specified. The first +time <tt>yacc.yacc()</tt> is invoked, you will get a message such as +this: <blockquote> <pre> $ python calcparse.py -yacc: Generating LALR parsing table... 
+Generating LALR tables calc > </pre> </blockquote> @@ -1582,7 +1684,8 @@ debugging file called <tt>parser.out</tt> is created. On subsequent executions, <tt>yacc</tt> will reload the table from <tt>parsetab.py</tt> unless it has detected a change in the underlying grammar (in which case the tables and <tt>parsetab.py</tt> file are -regenerated). Note: The names of parser output files can be changed if necessary. See the notes that follow later. +regenerated). Note: The names of parser output files can be changed +if necessary. See the <a href="reference.html">PLY Reference</a> for details. <p> If any errors are detected in your grammar specification, <tt>yacc.py</tt> will produce @@ -1597,9 +1700,18 @@ diagnostic messages and possibly raise an exception. Some of the errors that ca <li>Undefined rules and tokens </ul> -The next few sections now discuss a few finer points of grammar construction. +The next few sections discuss grammar specification in more detail. -<H3><a name="ply_nn25"></a>5.2 Combining Grammar Rule Functions</H3> +<p> +The final part of the example shows how to actually run the parser +created by +<tt>yacc()</tt>. To run the parser, you simply have to call +the <tt>parse()</tt> method with a string of input text. This will run all +of the grammar rules and return the result of the entire parse. This +returned result is the value assigned to <tt>p[0]</tt> in the starting +grammar rule. + +<H3><a name="ply_nn25"></a>6.2 Combining Grammar Rule Functions</H3> When grammar rules are similar, they can be combined into a single function. @@ -1668,7 +1780,15 @@ def p_expressions(p): </pre> </blockquote> -<H3><a name="ply_nn26"></a>5.3 Character Literals</H3> +If parsing performance is a concern, you should resist the urge to put +too much conditional processing into a single grammar rule as shown in +these examples. 
When you add checks to see which grammar rule is +being handled, you are actually duplicating the work that the parser +has already performed (i.e., the parser already knows exactly what rule it +matched). You can eliminate this overhead by using a +separate <tt>p_rule()</tt> function for each grammar rule. + +<H3><a name="ply_nn26"></a>6.3 Character Literals</H3> If desired, a grammar may contain tokens defined as single character literals. For example: @@ -1704,7 +1824,7 @@ literals = ['+','-','*','/' ] <b>Character literals are limited to a single character</b>. Thus, it is not legal to specify literals such as <tt>'<='</tt> or <tt>'=='</tt>. For this, use the normal lexing rules (e.g., define a rule such as <tt>t_EQ = r'=='</tt>). -<H3><a name="ply_nn26"></a>5.4 Empty Productions</H3> +<H3><a name="ply_nn26"></a>6.4 Empty Productions</H3> <tt>yacc.py</tt> can handle empty productions by defining a rule like this: @@ -1728,10 +1848,12 @@ def p_optitem(p): </pre> </blockquote> -Note: You can write empty rules anywhere by simply specifying an empty right hand side. However, I personally find that -writing an "empty" rule and using "empty" to denote an empty production is easier to read. +Note: You can write empty rules anywhere by simply specifying an empty +right hand side. However, I personally find that writing an "empty" +rule and using "empty" to denote an empty production is easier to read +and more clearly states your intentions. -<H3><a name="ply_nn28"></a>5.5 Changing the starting symbol</H3> +<H3><a name="ply_nn28"></a>6.5 Changing the starting symbol</H3> Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule). To change this, simply @@ -1751,8 +1873,10 @@ def p_foo(p): </pre> </blockquote> -The use of a <tt>start</tt> specifier may be useful during debugging since you can use it to have yacc build a subset of -a larger grammar. 
For this purpose, it is also possible to specify a starting symbol as an argument to <tt>yacc()</tt>. For example: +The use of a <tt>start</tt> specifier may be useful during debugging +since you can use it to have yacc build a subset of a larger grammar. +For this purpose, it is also possible to specify a starting symbol as +an argument to <tt>yacc()</tt>. For example: <blockquote> <pre> @@ -1760,12 +1884,14 @@ yacc.yacc(start='foo') </pre> </blockquote> -<H3><a name="ply_nn27"></a>5.6 Dealing With Ambiguous Grammars</H3> +<H3><a name="ply_nn27"></a>6.6 Dealing With Ambiguous Grammars</H3> -The expression grammar given in the earlier example has been written in a special format to eliminate ambiguity. -However, in many situations, it is extremely difficult or awkward to write grammars in this format. A -much more natural way to express the grammar is in a more compact form like this: +The expression grammar given in the earlier example has been written +in a special format to eliminate ambiguity. However, in many +situations, it is extremely difficult or awkward to write grammars in +this format. A much more natural way to express the grammar is in a +more compact form like this: <blockquote> <pre> @@ -1778,15 +1904,18 @@ expression : expression PLUS expression </pre> </blockquote> -Unfortunately, this grammar specification is ambiguous. For example, if you are parsing the string -"3 * 4 + 5", there is no way to tell how the operators are supposed to be grouped. -For example, does the expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? +Unfortunately, this grammar specification is ambiguous. For example, +if you are parsing the string "3 * 4 + 5", there is no way to tell how +the operators are supposed to be grouped. For example, does the +expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? <p> -When an ambiguous grammar is given to <tt>yacc.py</tt> it will print messages about "shift/reduce conflicts" -or a "reduce/reduce conflicts". 
A shift/reduce conflict is caused when the parser generator can't decide -whether or not to reduce a rule or shift a symbol on the parsing stack. For example, consider -the string "3 * 4 + 5" and the internal parsing stack: +When an ambiguous grammar is given to <tt>yacc.py</tt> it will print +messages about "shift/reduce conflicts" or "reduce/reduce conflicts". +A shift/reduce conflict is caused when the parser generator can't +decide whether or not to reduce a rule or shift a symbol on the +parsing stack. For example, consider the string "3 * 4 + 5" and the +internal parsing stack: <blockquote> <pre> @@ -1801,20 +1930,25 @@ Step Symbol Stack Input Tokens Action </pre> </blockquote> -In this case, when the parser reaches step 6, it has two options. One is to reduce the -rule <tt>expr : expr * expr</tt> on the stack. The other option is to shift the -token <tt>+</tt> on the stack. Both options are perfectly legal from the rules -of the context-free-grammar. +In this case, when the parser reaches step 6, it has two options. One +is to reduce the rule <tt>expr : expr * expr</tt> on the stack. The +other option is to shift the token <tt>+</tt> on the stack. Both +options are perfectly legal from the rules of the +context-free-grammar. <p> -By default, all shift/reduce conflicts are resolved in favor of shifting. Therefore, in the above -example, the parser will always shift the <tt>+</tt> instead of reducing. Although this -strategy works in many cases (including the ambiguous if-then-else), it is not enough for arithmetic -expressions. In fact, in the above example, the decision to shift <tt>+</tt> is completely wrong---we should have -reduced <tt>expr * expr</tt> since multiplication has higher mathematical precedence than addition. +By default, all shift/reduce conflicts are resolved in favor of +shifting. Therefore, in the above example, the parser will always +shift the <tt>+</tt> instead of reducing. 
Although this strategy +works in many cases (for example, the case of +"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, +in the above example, the decision to shift <tt>+</tt> is completely +wrong---we should have reduced <tt>expr * expr</tt> since +multiplication has higher mathematical precedence than addition. -<p>To resolve ambiguity, especially in expression grammars, <tt>yacc.py</tt> allows individual -tokens to be assigned a precedence level and associativity. This is done by adding a variable +<p>To resolve ambiguity, especially in expression +grammars, <tt>yacc.py</tt> allows individual tokens to be assigned a +precedence level and associativity. This is done by adding a variable <tt>precedence</tt> to the grammar file like this: <blockquote> @@ -1826,17 +1960,19 @@ precedence = ( </pre> </blockquote> -This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have -the same precedence level and are left-associative and that -<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are left-associative. -Within the <tt>precedence</tt> declaration, tokens are ordered from lowest to highest precedence. Thus, -this declaration specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher -precedence than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the +This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have the +same precedence level and are left-associative and that +<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are +left-associative. Within the <tt>precedence</tt> declaration, tokens +are ordered from lowest to highest precedence. Thus, this declaration +specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher precedence +than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the precedence specification). <p> -The precedence specification works by associating a numerical precedence level value and associativity direction to -the listed tokens. 
For example, in the above example you get: +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get: <blockquote> <pre> @@ -1847,9 +1983,10 @@ DIVIDE : level = 2, assoc = 'left' </pre> </blockquote> -These values are then used to attach a numerical precedence value and associativity direction -to each grammar rule. <em>This is always determined by looking at the precedence of the right-most terminal symbol.</em> -For example: +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. <em>This is always +determined by looking at the precedence of the right-most terminal +symbol.</em> For example: <blockquote> <pre> @@ -1867,7 +2004,7 @@ looking at the precedence rules and associativity specifiers. <p> <ol> -<li>If the current token has higher precedence, it is shifted. +<li>If the current token has higher precedence than the rule on the stack, it is shifted. <li>If the grammar rule on the stack has higher precedence, the rule is reduced. <li>If the current token and the grammar rule have the same precedence, the rule is reduced for left associativity, whereas the token is shifted for right associativity. @@ -1875,21 +2012,28 @@ rule is reduced for left associativity, whereas the token is shifted for right a favor of shifting (the default). </ol> -For example, if "expression PLUS expression" has been parsed and the next token -is "TIMES", the action is going to be a shift because "TIMES" has a higher precedence level than "PLUS". On the other -hand, if "expression TIMES expression" has been parsed and the next token is "PLUS", the action -is going to be reduce because "PLUS" has a lower precedence than "TIMES." 
+For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES." <p> -When shift/reduce conflicts are resolved using the first three techniques (with the help of -precedence rules), <tt>yacc.py</tt> will report no errors or conflicts in the grammar. +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), <tt>yacc.py</tt> will +report no errors or conflicts in the grammar (although it will print +some information in the <tt>parser.out</tt> debugging file). <p> -One problem with the precedence specifier technique is that it is sometimes necessary to -change the precedence of an operator in certain contents. For example, consider a unary-minus operator -in "3 + 4 * -5". Normally, unary minus has a very high precedence--being evaluated before the multiply. -However, in our precedence specifier, MINUS has a lower precedence than TIMES. To deal with this, -precedence rules can be given for fictitious tokens like this: +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. To +deal with this, precedence rules can be given for so-called "fictitious tokens" +like this: <blockquote> <pre> @@ -1978,11 +2122,27 @@ whether it's supposed to reduce the 5 as an expression and then reduce the rule <tt>assignment : ID EQUALS expression</tt>. 
<p> -It should be noted that reduce/reduce conflicts are notoriously difficult to spot -simply looking at the input grammer. To locate these, it is usually easier to look at the -<tt>parser.out</tt> debugging file with an appropriately high level of caffeination. +It should be noted that reduce/reduce conflicts are notoriously +difficult to spot simply by looking at the input grammar. When a +reduce/reduce conflict occurs, <tt>yacc()</tt> will try to help by +printing a warning message such as this: + +<blockquote> +<pre> +WARNING: 1 reduce/reduce conflict +WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) +WARNING: rejected rule (expression -> NUMBER) +</pre> +</blockquote> + +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. To try +and figure it out, you'll probably have to look at your grammar and +the contents of the +<tt>parser.out</tt> debugging file with an appropriately high level of +caffeination. -<H3><a name="ply_nn28"></a>5.7 The parser.out file</H3> +<H3><a name="ply_nn28"></a>6.7 The parser.out file</H3> Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR @@ -2240,10 +2400,15 @@ state 13 </pre> </blockquote> -In the file, each state of the grammar is described. Within each state the "." indicates the current -location of the parse within any applicable grammar rules. In addition, the actions for each valid -input token are listed. When a shift/reduce or reduce/reduce conflict arises, rules <em>not</em> selected -are prefixed with an !. For example: +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules.
Each state keeps track of the grammar +rules that might be in the process of being matched at that point. Within each +rule, the "." character indicates the current location of the parse +within that rule. In addition, the actions for each valid input token +are listed. When a shift/reduce or reduce/reduce conflict arises, +rules <em>not</em> selected are prefixed with an !. For example: <blockquote> <pre> @@ -2258,12 +2423,22 @@ By looking at these rules (and with a little practice), you can usually track do of most parsing conflicts. It should also be stressed that not all shift-reduce conflicts are bad. However, the only way to be sure that they are resolved correctly is to look at <tt>parser.out</tt>. -<H3><a name="ply_nn29"></a>5.8 Syntax Error Handling</H3> +<H3><a name="ply_nn29"></a>6.8 Syntax Error Handling</H3> -When a syntax error occurs during parsing, the error is immediately +If you are creating a parser for production use, the handling of +syntax errors is important. As a general rule, you don't want a +parser to simply throw up its hands and stop at the first sign of +trouble. Instead, you want it to report the error, recover if possible, and +continue parsing so that all of the errors in the input get reported +to the user at once. This is the standard behavior found in compilers +for languages such as C, C++, and Java. + +In PLY, when a syntax error occurs during parsing, the error is immediately detected (i.e., the parser does not read any more tokens beyond the -source of the error). Error recovery in LR parsers is a delicate +source of the error). However, at this point, the parser enters a +recovery mode that can be used to try and continue further parsing. +As a general rule, error recovery in LR parsers is a delicate topic that involves ancient rituals and black-magic. The recovery mechanism provided by <tt>yacc.py</tt> is comparable to Unix yacc so you may want consult a book like O'Reilly's "Lex and Yacc" for some of the finer details. 
@@ -2273,7 +2448,9 @@ When a syntax error occurs, <tt>yacc.py</tt> performs the following steps: <ol> <li>On the first occurrence of an error, the user-defined <tt>p_error()</tt> function -is called with the offending token as an argument. Afterwards, the parser enters +is called with the offending token as an argument. However, if the syntax error is due to +reaching the end-of-file, <tt>p_error()</tt> is called with an argument of <tt>None</tt>. +Afterwards, the parser enters an "error-recovery" mode in which it will not make future calls to <tt>p_error()</tt> until it has successfully shifted at least 3 tokens onto the parsing stack. @@ -2298,7 +2475,7 @@ shifted onto the parsing stack. parser can successfully shift a new symbol or reduce a rule involving <tt>error</tt>. </ol> -<H4><a name="ply_nn30"></a>5.8.1 Recovery and resynchronization with error rules</H4> +<H4><a name="ply_nn30"></a>6.8.1 Recovery and resynchronization with error rules</H4> The most well-behaved approach for handling syntax errors is to write grammar rules that include the <tt>error</tt> @@ -2350,7 +2527,7 @@ This is because the first bad token encountered will cause the rule to be reduced--which may make it difficult to recover if more bad tokens immediately follow. -<H4><a name="ply_nn31"></a>5.8.2 Panic mode recovery</H4> +<H4><a name="ply_nn31"></a>6.8.2 Panic mode recovery</H4> An alternative error recovery scheme is to enter a panic mode recovery in which tokens are @@ -2423,7 +2600,37 @@ def p_error(p): </pre> </blockquote> -<H4><a name="ply_nn32"></a>5.8.3 General comments on error handling</H4> +<H4><a name="ply_nn35"></a>6.8.3 Signaling an error from a production</H4> + + +If necessary, a production rule can manually force the parser to enter error recovery. This +is done by raising the <tt>SyntaxError</tt> exception like this: + +<blockquote> +<pre> +def p_production(p): + 'production : some production ...' 
+ raise SyntaxError +</pre> +</blockquote> + +The effect of raising <tt>SyntaxError</tt> is the same as if the last symbol shifted onto the +parsing stack was actually a syntax error. Thus, when you do this, the last symbol shifted is popped off +of the parsing stack and the current lookahead token is set to an <tt>error</tt> token. The parser +then enters error-recovery mode where it tries to reduce rules that can accept <tt>error</tt> tokens. +The steps that follow from this point are exactly the same as if a syntax error were detected and +<tt>p_error()</tt> were called. + +<P> +One important aspect of manually setting an error is that the <tt>p_error()</tt> function will <b>NOT</b> be +called in this case. If you need to issue an error message, make sure you do it in the production that +raises <tt>SyntaxError</tt>. + +<P> +Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc. + + +<H4><a name="ply_nn32"></a>6.8.4 General comments on error handling</H4> For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable @@ -2431,10 +2638,12 @@ technique. This is because you can instrument the grammar to catch errors at sel to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want to discard huge portions of the input text to find a valid restart point. -<H3><a name="ply_nn33"></a>5.9 Line Number and Position Tracking</H3> +<H3><a name="ply_nn33"></a>6.9 Line Number and Position Tracking</H3> + -Position tracking is often a tricky problem when writing compilers. By default, PLY tracks the line number and position of -all tokens. This information is available using the following functions: +Position tracking is often a tricky problem when writing compilers. +By default, PLY tracks the line number and position of all tokens. 
+This information is available using the following functions: <ul> <li><tt>p.lineno(num)</tt>. Return the line number for symbol <em>num</em> @@ -2452,9 +2661,11 @@ def p_expression(p): </pre> </blockquote> -As an optional feature, <tt>yacc.py</tt> can automatically track line numbers and positions for all of the grammar symbols -as well. However, this -extra tracking requires extra processing and can significantly slow down parsing. Therefore, it must be enabled by passing the +As an optional feature, <tt>yacc.py</tt> can automatically track line +numbers and positions for all of the grammar symbols as well. +However, this extra tracking requires extra processing and can +significantly slow down parsing. Therefore, it must be enabled by +passing the <tt>tracking=True</tt> option to <tt>yacc.parse()</tt>. For example: <blockquote> @@ -2463,8 +2674,9 @@ yacc.parse(data,tracking=True) </pre> </blockquote> -Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work for all grammar symbols. In addition, two -additional methods can be used: +Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work +for all grammar symbols. In addition, two additional methods can be +used: <ul> <li><tt>p.linespan(num)</tt>. Return a tuple (startline,endline) with the starting and ending line number for symbol <em>num</em>. @@ -2506,29 +2718,59 @@ def p_bad_func(p): </blockquote> <p> -Similarly, you may get better parsing performance if you only propagate line number -information where it's needed. For example: +Similarly, you may get better parsing performance if you only +selectively propagate line number information where it's needed using +the <tt>p.set_lineno()</tt> method. For example: <blockquote> <pre> def p_fname(p): 'fname : ID' - p[0] = (p[1],p.lineno(1)) + p[0] = p[1] + p.set_lineno(0,p.lineno(1)) </pre> </blockquote> -Finally, it should be noted that PLY does not store position information after a rule has been -processed. 
If it is important for you to retain this information in an abstract syntax tree, you -must make your own copy. +PLY doesn't retain line number information from rules that have already been +parsed. If you are building an abstract syntax tree and need to have line numbers, +you should make sure that the line numbers appear in the tree itself. -<H3><a name="ply_nn34"></a>5.10 AST Construction</H3> +<H3><a name="ply_nn34"></a>6.10 AST Construction</H3> -<tt>yacc.py</tt> provides no special functions for constructing an abstract syntax tree. However, such -construction is easy enough to do on your own. Simply create a data structure for abstract syntax tree nodes -and assign nodes to <tt>p[0]</tt> in each rule. +<tt>yacc.py</tt> provides no special functions for constructing an +abstract syntax tree. However, such construction is easy enough to do +on your own. -For example: +<p>A minimal way to construct a tree is to simply create and +propagate a tuple or list in each grammar rule function. There +are many possible ways to do this, but one example would be something +like this: + +<blockquote> +<pre> +def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = ('binary-expression',p[2],p[1],p[3]) + +def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = ('group-expression',p[2]) + +def p_expression_number(p): + 'expression : NUMBER' + p[0] = ('number-expression',p[1]) +</pre> +</blockquote> + +<p> +Another approach is to create a set of data structures for different +kinds of abstract syntax tree nodes and assign nodes to <tt>p[0]</tt> +in each rule. For example: <blockquote> <pre> @@ -2564,8 +2806,12 @@ def p_expression_number(p): </pre> </blockquote> -To simplify tree traversal, it may make sense to pick a very generic tree structure for your parse tree nodes.
-For example: +The advantage to this approach is that it may make it easier to attach more complicated +semantics, type checking, code generation, and other features to the node classes. + +<p> +To simplify tree traversal, it may make sense to pick a very generic +tree structure for your parse tree nodes. For example: <blockquote> <pre> @@ -2588,7 +2834,7 @@ def p_expression_binop(p): </pre> </blockquote> -<H3><a name="ply_nn35"></a>5.11 Embedded Actions</H3> +<H3><a name="ply_nn35"></a>6.11 Embedded Actions</H3> The parsing technique used by yacc only allows actions to be executed at the end of a rule. For example, @@ -2608,7 +2854,7 @@ symbols <tt>A</tt>, <tt>B</tt>, <tt>C</tt>, and <tt>D</tt> have been parsed. Sometimes, however, it is useful to execute small code fragments during intermediate stages of parsing. For example, suppose you wanted to perform some action immediately after <tt>A</tt> has -been parsed. To do this, you can write a empty rule like this: +been parsed. To do this, write an empty rule like this: <blockquote> <pre> @@ -2671,8 +2917,11 @@ def p_seen_AB(p): </pre> </blockquote> -an extra shift-reduce conflict will be introduced. This conflict is caused by the fact that the same symbol <tt>C</tt> appears next in -both the <tt>abcd</tt> and <tt>abcx</tt> rules. The parser can either shift the symbol (<tt>abcd</tt> rule) or reduce the empty rule <tt>seen_AB</tt> (<tt>abcx</tt> rule). +an extra shift-reduce conflict will be introduced. This conflict is +caused by the fact that the same symbol <tt>C</tt> appears next in +both the <tt>abcd</tt> and <tt>abcx</tt> rules. The parser can either +shift the symbol (<tt>abcd</tt> rule) or reduce the empty +rule <tt>seen_AB</tt> (<tt>abcx</tt> rule). 
<p> A common use of embedded rules is to control other aspects of parsing @@ -2696,10 +2945,14 @@ def p_new_scope(p): </pre> </blockquote> -In this case, the embedded action <tt>new_scope</tt> executes immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed. This might -adjust internal symbol tables and other aspects of the parser. Upon completion of the rule <tt>statements_block</tt>, code might undo the operations performed in the embedded action (e.g., <tt>pop_scope()</tt>). +In this case, the embedded action <tt>new_scope</tt> executes +immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed. +This might adjust internal symbol tables and other aspects of the +parser. Upon completion of the rule <tt>statements_block</tt>, code +might undo the operations performed in the embedded action +(e.g., <tt>pop_scope()</tt>). -<H3><a name="ply_nn36"></a>5.12 Yacc implementation notes</H3> +<H3><a name="ply_nn36"></a>6.12 Miscellaneous Yacc Notes</H3> <ul> @@ -2770,16 +3023,7 @@ each time it runs (which may take awhile depending on how large your grammar is) <blockquote> <pre> -yacc.parse(debug=1) -</pre> -</blockquote> - -<p> -<li>To redirect the debugging output to a filename of your choosing, use: - -<blockquote> -<pre> -yacc.parse(debug=1, debugfile="debugging.out") +yacc.parse(debug=1) </pre> </blockquote> @@ -2812,17 +3056,17 @@ machine. Please be patient. size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules. </ul> -<H2><a name="ply_nn37"></a>6. Parser and Lexer State Management</H2> +<H2><a name="ply_nn37"></a>7. Multiple Parsers and Lexers</H2> In advanced parsing applications, you may want to have multiple -parsers and lexers. Furthermore, the parser may want to control the -behavior of the lexer in some way. +parsers and lexers. <p> -To do this, it is important to note that both the lexer and parser are -actually implemented as objects. 
These objects are returned by the -<tt>lex()</tt> and <tt>yacc()</tt> functions respectively. For example: +As a general rule, this isn't a problem. However, to make it work, +you need to carefully make sure everything gets hooked up correctly. +First, make sure you save the objects returned by <tt>lex()</tt> and +<tt>yacc()</tt>. For example: <blockquote> <pre> @@ -2831,7 +3075,8 @@ parser = yacc.yacc() # Return parser object </pre> </blockquote> -To attach the lexer and parser together, make sure you use the <tt>lexer</tt> argumemnt to parse. For example: +Next, when parsing, make sure you give the <tt>parse()</tt> function a reference to the lexer it +should be using. For example: <blockquote> <pre> @@ -2839,8 +3084,13 @@ parser.parse(text,lexer=lexer) </pre> </blockquote> -Within lexer and parser rules, these objects are also available. In the lexer, -the "lexer" attribute of a token refers to the lexer object in use. For example: +If you forget to do this, the parser will use the last lexer +created--which is not always what you want. + +<p> +Within lexer and parser rule functions, these objects are also +available. In the lexer, the "lexer" attribute of a token refers to +the lexer object that triggered the rule. For example: <blockquote> <pre> @@ -2868,7 +3118,7 @@ If necessary, arbitrary attributes can be attached to the lexer or parser object For example, if you wanted to have different parsing modes, you could attach a mode attribute to the parser object and look at it later. -<H2><a name="ply_nn38"></a>7. Using Python's Optimized Mode</H2> +<H2><a name="ply_nn38"></a>8. Using Python's Optimized Mode</H2> Because PLY uses information from doc-strings, parsing and lexing @@ -2891,9 +3141,110 @@ the tables without the need for doc strings. <p> Beware: running PLY in optimized mode disables a lot of error checking. You should only do this when your project has stabilized -and you don't need to do any debugging. - -<H2><a name="ply_nn39"></a>8.
Where to go from here?</H2> +and you don't need to do any debugging. One of the purposes of +optimized mode is to substantially decrease the startup time of +your compiler (by assuming that everything is already properly +specified and works). + +<H2><a name="ply_nn44"></a>9. Advanced Debugging</H2> + + +<p> +Debugging a compiler is typically not an easy task. PLY provides some +advanced diagnostic capabilities through the use of Python's +<tt>logging</tt> module. The next two sections describe this: + +<H3><a name="ply_nn45"></a>9.1 Debugging the lex() and yacc() commands</H3> + + +<p> +Both the <tt>lex()</tt> and <tt>yacc()</tt> commands have a debugging +mode that can be enabled using the <tt>debug</tt> flag. For example: + +<blockquote> +<pre> +lex.lex(debug=True) +yacc.yacc(debug=True) +</pre> +</blockquote> + +Normally, the output produced by debugging is routed to either +standard error or, in the case of <tt>yacc()</tt>, to a file +<tt>parser.out</tt>. This output can be more carefully controlled +by supplying a logging object. Here is an example that adds +information about where different debugging messages are coming from: + +<blockquote> +<pre> +# Set up a logging object +import logging +logging.basicConfig( + level = logging.DEBUG, + filename = "parselog.txt", + filemode = "w", + format = "%(filename)10s:%(lineno)4d:%(message)s" +) +log = logging.getLogger() + +lex.lex(debug=True,debuglog=log) +yacc.yacc(debug=True,debuglog=log) +</pre> +</blockquote> + +If you supply a custom logger, the amount of debugging +information produced can be controlled by setting the logging level. +Typically, debugging messages are either issued at the <tt>DEBUG</tt>, +<tt>INFO</tt>, or <tt>WARNING</tt> levels. + +<p> +PLY's error messages and warnings are also produced using the logging +interface. This can be controlled by passing a logging object +using the <tt>errorlog</tt> parameter.
+ +<blockquote> +<pre> +lex.lex(errorlog=log) +yacc.yacc(errorlog=log) +</pre> +</blockquote> + +If you want to completely silence warnings, you can either pass in a +logging object with an appropriate filter level or use the <tt>NullLogger</tt> +object defined in either <tt>lex</tt> or <tt>yacc</tt>. For example: + +<blockquote> +<pre> +yacc.yacc(errorlog=yacc.NullLogger()) +</pre> +</blockquote> + +<H3><a name="ply_nn46"></a>9.2 Run-time Debugging</H3> + + +<p> +To enable run-time debugging of a parser, use the <tt>debug</tt> option to parse. This +option can either be an integer (which simply turns debugging on or off) or an instance +of a logger object. For example: + +<blockquote> +<pre> +log = logging.getLogger() +parser.parse(input,debug=log) +</pre> +</blockquote> + +If a logging object is passed, you can use its filtering level to control how much +output gets generated. The <tt>INFO</tt> level is used to produce information +about rule reductions. The <tt>DEBUG</tt> level will show information about the +parsing stack, token shifts, and other details. The <tt>ERROR</tt> level shows information +related to parsing errors. + +<p> +For very complicated problems, you should pass in a logging object that +redirects to a file where you can more easily inspect the output after +execution. + +<H2><a name="ply_nn39"></a>10. Where to go from here?</H2> The <tt>examples</tt> directory of the PLY distribution contains several simple examples. 
Please consult a diff --git a/ext/ply/example/BASIC/basic.py b/ext/ply/example/BASIC/basic.py index 3a07acdbf..b14483d2d 100644 --- a/ext/ply/example/BASIC/basic.py +++ b/ext/ply/example/BASIC/basic.py @@ -4,6 +4,9 @@ import sys sys.path.insert(0,"../..") +if sys.version_info[0] >= 3: + raw_input = input + import basiclex import basparse import basinterp @@ -41,7 +44,7 @@ while 1: prog = basparse.parse(line) if not prog: continue - keys = prog.keys() + keys = list(prog) if keys[0] > 0: b.add_statements(prog) else: @@ -58,8 +61,8 @@ while 1: elif stat[0] == 'NEW': b.new() - - + + diff --git a/ext/ply/example/BASIC/basiclex.py b/ext/ply/example/BASIC/basiclex.py index 727383f2b..3d27cdeeb 100644 --- a/ext/ply/example/BASIC/basiclex.py +++ b/ext/ply/example/BASIC/basiclex.py @@ -25,7 +25,7 @@ def t_ID(t): if t.value in keywords: t.type = t.value return t - + t_EQUALS = r'=' t_PLUS = r'\+' t_MINUS = r'-' @@ -41,7 +41,7 @@ t_GE = r'>=' t_NE = r'<>' t_COMMA = r'\,' t_SEMI = r';' -t_INTEGER = r'\d+' +t_INTEGER = r'\d+' t_FLOAT = r'((\d*\.\d+)(E[\+-]?\d+)?|([1-9]\d*E[\+-]?\d+))' t_STRING = r'\".*?\"' @@ -51,14 +51,10 @@ def t_NEWLINE(t): return t def t_error(t): - print "Illegal character", t.value[0] + print("Illegal character %s" % t.value[0]) t.lexer.skip(1) -lex.lex() - - - - +lex.lex(debug=0) @@ -66,6 +62,10 @@ lex.lex() + + + + diff --git a/ext/ply/example/BASIC/basiclog.py b/ext/ply/example/BASIC/basiclog.py new file mode 100644 index 000000000..ccfd7b967 --- /dev/null +++ b/ext/ply/example/BASIC/basiclog.py @@ -0,0 +1,79 @@ +# An implementation of Dartmouth BASIC (1964) +# + +import sys +sys.path.insert(0,"../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import logging +logging.basicConfig( + level = logging.INFO, + filename = "parselog.txt", + filemode = "w" +) +log = logging.getLogger() + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. 
+# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data,debug=log) + if not prog: raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. + +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: continue + line += "\n" + prog = basparse.parse(line,debug=log) + if not prog: continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new() + + + + + + + + + diff --git a/ext/ply/example/BASIC/basinterp.py b/ext/ply/example/BASIC/basinterp.py index 5850457cb..3e8a7774a 100644 --- a/ext/ply/example/BASIC/basinterp.py +++ b/ext/ply/example/BASIC/basinterp.py @@ -40,10 +40,11 @@ class BasicInterpreter: if self.prog[lineno][0] == 'END' and not has_end: has_end = lineno if not has_end: - print "NO END INSTRUCTION" + print("NO END INSTRUCTION") self.error = 1 + return if has_end != lineno: - print "END IS NOT LAST" + print("END IS NOT LAST") self.error = 1 # Check loops @@ -60,9 +61,9 @@ class BasicInterpreter: self.loopend[pc] = i break else: - print "FOR WITHOUT NEXT AT LINE" % self.stat[pc] + print("FOR WITHOUT NEXT AT LINE %s" % self.stat[pc]) self.error = 1 - + # Evaluate an expression def eval(self,expr): etype = expr[0] @@ -79,33 +80,33 @@ class BasicInterpreter: elif etype == 'VAR': var,dim1,dim2 = expr[1] if not 
dim1 and not dim2: - if self.vars.has_key(var): + if var in self.vars: return self.vars[var] else: - print "UNDEFINED VARIABLE", var, "AT LINE", self.stat[self.pc] + print("UNDEFINED VARIABLE %s AT LINE %s" % (var, self.stat[self.pc])) raise RuntimeError # May be a list lookup or a function evaluation if dim1 and not dim2: - if self.functions.has_key(var): + if var in self.functions: # A function return self.functions[var](dim1) else: # A list evaluation - if self.lists.has_key(var): + if var in self.lists: dim1val = self.eval(dim1) if dim1val < 1 or dim1val > len(self.lists[var]): - print "LIST INDEX OUT OF BOUNDS AT LINE", self.stat[self.pc] + print("LIST INDEX OUT OF BOUNDS AT LINE %s" % self.stat[self.pc]) raise RuntimeError return self.lists[var][dim1val-1] if dim1 and dim2: - if self.tables.has_key(var): + if var in self.tables: dim1val = self.eval(dim1) dim2val = self.eval(dim2) if dim1val < 1 or dim1val > len(self.tables[var]) or dim2val < 1 or dim2val > len(self.tables[var][0]): - print "TABLE INDEX OUT OUT BOUNDS AT LINE", self.stat[self.pc] + print("TABLE INDEX OUT OUT BOUNDS AT LINE %s" % self.stat[self.pc]) raise RuntimeError return self.tables[var][dim1val-1][dim2val-1] - print "UNDEFINED VARIABLE", var, "AT LINE", self.stat[self.pc] + print("UNDEFINED VARIABLE %s AT LINE %s" % (var, self.stat[self.pc])) raise RuntimeError # Evaluate a relational expression @@ -145,31 +146,31 @@ class BasicInterpreter: elif dim1 and not dim2: # List assignment dim1val = self.eval(dim1) - if not self.lists.has_key(var): + if not var in self.lists: self.lists[var] = [0]*10 if dim1val > len(self.lists[var]): - print "DIMENSION TOO LARGE AT LINE", self.stat[self.pc] + print ("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) raise RuntimeError self.lists[var][dim1val-1] = self.eval(value) elif dim1 and dim2: dim1val = self.eval(dim1) dim2val = self.eval(dim2) - if not self.tables.has_key(var): + if not var in self.tables: temp = [0]*10 v = [] for i in range(10): 
v.append(temp[:]) self.tables[var] = v # Variable already exists if dim1val > len(self.tables[var]) or dim2val > len(self.tables[var][0]): - print "DIMENSION TOO LARGE AT LINE", self.stat[self.pc] + print("DIMENSION TOO LARGE AT LINE %s" % self.stat[self.pc]) raise RuntimeError self.tables[var][dim1val-1][dim2val-1] = self.eval(value) # Change the current line number def goto(self,linenum): - if not self.prog.has_key(linenum): - print "UNDEFINED LINE NUMBER %d AT LINE %d" % (linenum, self.stat[self.pc]) + if not linenum in self.prog: + print("UNDEFINED LINE NUMBER %d AT LINE %d" % (linenum, self.stat[self.pc])) raise RuntimeError self.pc = self.stat.index(linenum) @@ -183,7 +184,7 @@ class BasicInterpreter: self.gosub = None # Gosub return point (if any) self.error = 0 # Indicates program error - self.stat = self.prog.keys() # Ordered list of all line numbers + self.stat = list(self.prog) # Ordered list of all line numbers self.stat.sort() self.pc = 0 # Current program counter @@ -198,7 +199,7 @@ class BasicInterpreter: while 1: line = self.stat[self.pc] instr = self.prog[line] - + op = instr[0] # END and STOP statements @@ -225,11 +226,11 @@ class BasicInterpreter: out += str(eval) sys.stdout.write(out) end = instr[2] - if not (end == ',' or end == ';'): + if not (end == ',' or end == ';'): sys.stdout.write("\n") if end == ',': sys.stdout.write(" "*(15-(len(out) % 15))) if end == ';': sys.stdout.write(" "*(3-(len(out) % 3))) - + # LET statement elif op == 'LET': target = instr[1] @@ -258,7 +259,7 @@ class BasicInterpreter: initval = instr[2] finval = instr[3] stepval = instr[4] - + # Check to see if this is a new loop if not self.loops or self.loops[-1][0] != self.pc: # Looks like a new loop. 
Make the initial assignment @@ -284,21 +285,21 @@ class BasicInterpreter: elif op == 'NEXT': if not self.loops: - print "NEXT WITHOUT FOR AT LINE",line + print("NEXT WITHOUT FOR AT LINE %s" % line) return - + nextvar = instr[1] self.pc = self.loops[-1][0] loopinst = self.prog[self.stat[self.pc]] forvar = loopinst[1] if nextvar != forvar: - print "NEXT DOESN'T MATCH FOR AT LINE", line + print("NEXT DOESN'T MATCH FOR AT LINE %s" % line) return continue elif op == 'GOSUB': newline = instr[1] if self.gosub: - print "ALREADY IN A SUBROUTINE AT LINE", line + print("ALREADY IN A SUBROUTINE AT LINE %s" % line) return self.gosub = self.stat[self.pc] self.goto(newline) @@ -306,7 +307,7 @@ class BasicInterpreter: elif op == 'RETURN': if not self.gosub: - print "RETURN WITHOUT A GOSUB AT LINE",line + print("RETURN WITHOUT A GOSUB AT LINE %s" % line) return self.goto(self.gosub) self.gosub = None @@ -333,7 +334,7 @@ class BasicInterpreter: v.append(temp[:]) self.tables[vname] = v - self.pc += 1 + self.pc += 1 # Utility functions for program listing def expr_str(self,expr): @@ -358,74 +359,74 @@ class BasicInterpreter: # Create a program listing def list(self): - stat = self.prog.keys() # Ordered list of all line numbers + stat = list(self.prog) # Ordered list of all line numbers stat.sort() for line in stat: instr = self.prog[line] op = instr[0] if op in ['END','STOP','RETURN']: - print line, op + print("%s %s" % (line, op)) continue elif op == 'REM': - print line, instr[1] + print("%s %s" % (line, instr[1])) elif op == 'PRINT': - print line, op, + _out = "%s %s " % (line, op) first = 1 for p in instr[1]: - if not first: print ",", - if p[0] and p[1]: print '"%s"%s' % (p[0],self.expr_str(p[1])), - elif p[1]: print self.expr_str(p[1]), - else: print '"%s"' % (p[0],), + if not first: _out += ", " + if p[0] and p[1]: _out += '"%s"%s' % (p[0],self.expr_str(p[1])) + elif p[1]: _out += self.expr_str(p[1]) + else: _out += '"%s"' % (p[0],) first = 0 - if instr[2]: print instr[2] - 
else: print + if instr[2]: _out += instr[2] + print(_out) elif op == 'LET': - print line,"LET",self.var_str(instr[1]),"=",self.expr_str(instr[2]) + print("%s LET %s = %s" % (line,self.var_str(instr[1]),self.expr_str(instr[2]))) elif op == 'READ': - print line,"READ", + _out = "%s READ " % line first = 1 for r in instr[1]: - if not first: print ",", - print self.var_str(r), + if not first: _out += "," + _out += self.var_str(r) first = 0 - print "" + print(_out) elif op == 'IF': - print line,"IF %s THEN %d" % (self.relexpr_str(instr[1]),instr[2]) + print("%s IF %s THEN %d" % (line,self.relexpr_str(instr[1]),instr[2])) elif op == 'GOTO' or op == 'GOSUB': - print line, op, instr[1] + print("%s %s %s" % (line, op, instr[1])) elif op == 'FOR': - print line,"FOR %s = %s TO %s" % (instr[1],self.expr_str(instr[2]),self.expr_str(instr[3])), - if instr[4]: print "STEP %s" % (self.expr_str(instr[4])), - print + _out = "%s FOR %s = %s TO %s" % (line,instr[1],self.expr_str(instr[2]),self.expr_str(instr[3])) + if instr[4]: _out += " STEP %s" % (self.expr_str(instr[4])) + print(_out) elif op == 'NEXT': - print line,"NEXT", instr[1] + print("%s NEXT %s" % (line, instr[1])) elif op == 'FUNC': - print line,"DEF %s(%s) = %s" % (instr[1],instr[2],self.expr_str(instr[3])) + print("%s DEF %s(%s) = %s" % (line,instr[1],instr[2],self.expr_str(instr[3]))) elif op == 'DIM': - print line,"DIM", + _out = "%s DIM " % line first = 1 for vname,x,y in instr[1]: - if not first: print ",", + if not first: _out += "," first = 0 if y == 0: - print "%s(%d)" % (vname,x), + _out += "%s(%d)" % (vname,x) else: - print "%s(%d,%d)" % (vname,x,y), - - print + _out += "%s(%d,%d)" % (vname,x,y) + + print(_out) elif op == 'DATA': - print line,"DATA", + _out = "%s DATA " % line first = 1 for v in instr[1]: - if not first: print ",", + if not first: _out += "," first = 0 - print v, - print + _out += v + print(_out) # Erase the current program def new(self): self.prog = {} - + # Insert statements def 
add_statements(self,prog): for line,stat in prog.items(): diff --git a/ext/ply/example/BASIC/basparse.py b/ext/ply/example/BASIC/basparse.py index 930af9a22..ccdeb16b6 100644 --- a/ext/ply/example/BASIC/basparse.py +++ b/ext/ply/example/BASIC/basparse.py @@ -39,12 +39,12 @@ def p_program_error(p): p[0] = None p.parser.error = 1 -#### Format of all BASIC statements. +#### Format of all BASIC statements. def p_statement(p): '''statement : INTEGER command NEWLINE''' if isinstance(p[2],str): - print p[2],"AT LINE", p[1] + print("%s %s %s" % (p[2],"AT LINE", p[1])) p[0] = None p.parser.error = 1 else: @@ -68,7 +68,7 @@ def p_statement_blank(p): def p_statement_bad(p): '''statement : INTEGER error NEWLINE''' - print "MALFORMED STATEMENT AT LINE", p[1] + print("MALFORMED STATEMENT AT LINE %s" % p[1]) p[0] = None p.parser.error = 1 @@ -121,7 +121,7 @@ def p_command_print_bad(p): #### Optional ending on PRINT. Either a comma (,) or semicolon (;) def p_optend(p): - '''optend : COMMA + '''optend : COMMA | SEMI |''' if len(p) == 2: @@ -188,7 +188,7 @@ def p_optstep(p): p[0] = None #### NEXT statement - + def p_command_next(p): '''command : NEXT ID''' @@ -392,30 +392,30 @@ def p_item_expr(p): p[0] = ("",p[1]) #### Empty - + def p_empty(p): '''empty : ''' #### Catastrophic error handler def p_error(p): if not p: - print "SYNTAX ERROR AT EOF" + print("SYNTAX ERROR AT EOF") bparser = yacc.yacc() -def parse(data): +def parse(data,debug=0): bparser.error = 0 - p = bparser.parse(data) + p = bparser.parse(data,debug=debug) if bparser.error: return None return p - - - - + + + + diff --git a/ext/ply/example/GardenSnake/GardenSnake.py b/ext/ply/example/GardenSnake/GardenSnake.py index ffa550fc6..2a7f45eb1 100644 --- a/ext/ply/example/GardenSnake/GardenSnake.py +++ b/ext/ply/example/GardenSnake/GardenSnake.py @@ -180,7 +180,7 @@ def track_tokens_filter(lexer, tokens): at_line_start = False indent = MAY_INDENT token.must_indent = False - + elif token.type == "NEWLINE": at_line_start = True 
if indent == MAY_INDENT: @@ -235,7 +235,7 @@ def indentation_filter(tokens): ## if token.must_indent: ## print "must_indent", ## print - + # WS only occurs at the start of the line # There may be WS followed by NEWLINE so # only track the depth here. Don't indent/dedent @@ -294,7 +294,7 @@ def indentation_filter(tokens): assert token is not None for _ in range(1, len(levels)): yield DEDENT(token.lineno) - + # The top-level filter adds an ENDMARKER, if requested. # Python's grammar uses it. @@ -376,14 +376,14 @@ def p_file_input(p): p[0] = p[1] + p[2] else: p[0] = p[1] - + # funcdef: [decorators] 'def' NAME parameters ':' suite # ignoring decorators def p_funcdef(p): "funcdef : DEF NAME parameters COLON suite" p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) - + # parameters: '(' [varargslist] ')' def p_parameters(p): """parameters : LPAR RPAR @@ -392,9 +392,9 @@ def p_parameters(p): p[0] = [] else: p[0] = p[2] + - -# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | +# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | # highly simplified def p_varargslist(p): """varargslist : varargslist COMMA NAME @@ -409,7 +409,7 @@ def p_stmt_simple(p): """stmt : simple_stmt""" # simple_stmt is a list p[0] = p[1] - + def p_stmt_compound(p): """stmt : compound_stmt""" p[0] = [p[1]] @@ -474,7 +474,7 @@ def p_suite(p): p[0] = ast.Stmt(p[1]) else: p[0] = ast.Stmt(p[3]) - + def p_stmts(p): """stmts : stmts stmt @@ -536,7 +536,7 @@ def p_comparison(p): p[0] = unary_ops[p[1]](p[2]) else: p[0] = p[1] - + # power: atom trailer* ['**' factor] # trailers enables function calls. 
I only allow one level of calls # so this is 'trailer' @@ -605,7 +605,7 @@ def p_testlist_multi(p): def p_test(p): "test : comparison" p[0] = p[1] - + # arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) @@ -642,7 +642,7 @@ class GardenSnakeParser(object): ###### Code generation ###### - + from compiler import misc, syntax, pycodegen class GardenSnakeCompiler(object): @@ -658,13 +658,13 @@ class GardenSnakeCompiler(object): return code ####### Test code ####### - + compile = GardenSnakeCompiler().compile code = r""" print('LET\'S TRY THIS \\OUT') - + #Comment here def x(a): print('called with',a) diff --git a/ext/ply/example/ansic/clex.py b/ext/ply/example/ansic/clex.py index 12441a60b..37fdd8e66 100644 --- a/ext/ply/example/ansic/clex.py +++ b/ext/ply/example/ansic/clex.py @@ -26,7 +26,7 @@ tokens = reserved + ( 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT', 'LOR', 'LAND', 'LNOT', 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE', - + # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=) 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL', 'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', @@ -39,7 +39,7 @@ tokens = reserved + ( # Conditional operator (?) 'CONDOP', - + # Delimeters ( ) [ ] { } , . 
; : 'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET', @@ -57,7 +57,7 @@ t_ignore = ' \t\x0c' def t_NEWLINE(t): r'\n+' t.lexer.lineno += t.value.count("\n") - + # Operators t_PLUS = r'\+' t_MINUS = r'-' @@ -142,23 +142,23 @@ t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\'' # Comments def t_comment(t): - r' /\*(.|\n)*?\*/' - t.lineno += t.value.count('\n') + r'/\*(.|\n)*?\*/' + t.lexer.lineno += t.value.count('\n') # Preprocessor directive (ignored) def t_preprocessor(t): r'\#(.)*?\n' - t.lineno += 1 - + t.lexer.lineno += 1 + def t_error(t): - print "Illegal character %s" % repr(t.value[0]) + print("Illegal character %s" % repr(t.value[0])) t.lexer.skip(1) - + lexer = lex.lex(optimize=1) if __name__ == "__main__": lex.runmain(lexer) - + diff --git a/ext/ply/example/ansic/cparse.py b/ext/ply/example/ansic/cparse.py index d474378c8..c9b916455 100644 --- a/ext/ply/example/ansic/cparse.py +++ b/ext/ply/example/ansic/cparse.py @@ -155,7 +155,7 @@ def p_struct_declaration_list_1(t): pass def p_struct_declaration_list_2(t): - 'struct_declaration_list : struct_declarator_list struct_declaration' + 'struct_declaration_list : struct_declaration_list struct_declaration' pass # init-declarator-list: @@ -778,12 +778,12 @@ def p_unary_expression_5(t): def p_unary_expression_6(t): 'unary_expression : SIZEOF LPAREN type_name RPAREN' pass - + #unary-operator def p_unary_operator(t): '''unary_operator : AND | TIMES - | PLUS + | PLUS | MINUS | NOT | LNOT ''' @@ -837,7 +837,7 @@ def p_argument_expression_list(t): pass # constant: -def p_constant(t): +def p_constant(t): '''constant : ICONST | FCONST | CCONST''' @@ -849,7 +849,7 @@ def p_empty(t): pass def p_error(t): - print "Whoa. We're hosed" + print("Whoa. 
We're hosed") import profile # Build the grammar diff --git a/ext/ply/example/calc/calc.py b/ext/ply/example/calc/calc.py index 987ce8019..b92378043 100644 --- a/ext/ply/example/calc/calc.py +++ b/ext/ply/example/calc/calc.py @@ -8,6 +8,9 @@ import sys sys.path.insert(0,"../..") +if sys.version_info[0] >= 3: + raw_input = input + tokens = ( 'NAME','NUMBER', ) @@ -20,11 +23,7 @@ t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Integer value too large", t.value - t.value = 0 + t.value = int(t.value) return t t_ignore = " \t" @@ -32,11 +31,11 @@ t_ignore = " \t" def t_newline(t): r'\n+' t.lexer.lineno += t.value.count("\n") - + def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - + # Build the lexer import ply.lex as lex lex.lex() @@ -58,7 +57,7 @@ def p_statement_assign(p): def p_statement_expr(p): 'statement : expression' - print p[1] + print(p[1]) def p_expression_binop(p): '''expression : expression '+' expression @@ -87,11 +86,14 @@ def p_expression_name(p): try: p[0] = names[p[1]] except LookupError: - print "Undefined name '%s'" % p[1] + print("Undefined name '%s'" % p[1]) p[0] = 0 def p_error(p): - print "Syntax error at '%s'" % p.value + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") import ply.yacc as yacc yacc.yacc() diff --git a/ext/ply/example/calcdebug/calc.py b/ext/ply/example/calcdebug/calc.py new file mode 100644 index 000000000..6732f9f32 --- /dev/null +++ b/ext/ply/example/calcdebug/calc.py @@ -0,0 +1,113 @@ +# ----------------------------------------------------------------------------- +# calc.py +# +# This example shows how to run the parser in a debugging mode +# with output routed to a logging object. 
+# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0,"../..") + +if sys.version_info[0] >= 3: + raw_input = input + +tokens = ( + 'NAME','NUMBER', + ) + +literals = ['=','+','-','*','/', '(',')'] + +# Tokens + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import ply.lex as lex +lex.lex() + +# Parsing rules + +precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(p): + 'statement : NAME "=" expression' + names[p[1]] = p[3] + +def p_statement_expr(p): + 'statement : expression' + print(p[1]) + +def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+' : p[0] = p[1] + p[3] + elif p[2] == '-': p[0] = p[1] - p[3] + elif p[2] == '*': p[0] = p[1] * p[3] + elif p[2] == '/': p[0] = p[1] / p[3] + +def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + +def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + +def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + +def p_expression_name(p): + "expression : NAME" + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + +def p_error(p): + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") + +import ply.yacc as yacc +yacc.yacc() + +import logging +logging.basicConfig( + level=logging.INFO, + filename="parselog.txt" +) + +while 1: + try: + s = raw_input('calc > ') + except EOFError: + break + if not s: continue + yacc.parse(s,debug=logging.getLogger()) diff --git 
a/ext/ply/example/classcalc/calc.py b/ext/ply/example/classcalc/calc.py index b2f3f70f1..bf0d065e4 100644..100755 --- a/ext/ply/example/classcalc/calc.py +++ b/ext/ply/example/classcalc/calc.py @@ -12,7 +12,9 @@ import sys sys.path.insert(0,"../..") -import readline +if sys.version_info[0] >= 3: + raw_input = input + import ply.lex as lex import ply.yacc as yacc import os @@ -51,7 +53,7 @@ class Parser: if not s: continue yacc.parse(s) - + class Calc(Parser): tokens = ( @@ -77,7 +79,7 @@ class Calc(Parser): try: t.value = int(t.value) except ValueError: - print "Integer value too large", t.value + print("Integer value too large %s" % t.value) t.value = 0 #print "parsed number %s" % repr(t.value) return t @@ -87,9 +89,9 @@ class Calc(Parser): def t_newline(self, t): r'\n+' t.lexer.lineno += t.value.count("\n") - + def t_error(self, t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) # Parsing rules @@ -107,7 +109,7 @@ class Calc(Parser): def p_statement_expr(self, p): 'statement : expression' - print p[1] + print(p[1]) def p_expression_binop(self, p): """ @@ -141,11 +143,14 @@ class Calc(Parser): try: p[0] = self.names[p[1]] except LookupError: - print "Undefined name '%s'" % p[1] + print("Undefined name '%s'" % p[1]) p[0] = 0 def p_error(self, p): - print "Syntax error at '%s'" % p.value + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") if __name__ == '__main__': calc = Calc() diff --git a/ext/ply/example/cleanup.sh b/ext/ply/example/cleanup.sh index 3e115f41c..3e115f41c 100644..100755 --- a/ext/ply/example/cleanup.sh +++ b/ext/ply/example/cleanup.sh diff --git a/ext/ply/example/closurecalc/calc.py b/ext/ply/example/closurecalc/calc.py new file mode 100644 index 000000000..6598f5844 --- /dev/null +++ b/ext/ply/example/closurecalc/calc.py @@ -0,0 +1,130 @@ +# ----------------------------------------------------------------------------- +# calc.py +# +# A calculator 
parser that makes use of closures. The function make_calculator() +# returns a function that accepts an input string and returns a result. All +# lexing rules, parsing rules, and internal state are held inside the function. +# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0,"../..") + +if sys.version_info[0] >= 3: + raw_input = input + +# Make a calculator function + +def make_calculator(): + import ply.lex as lex + import ply.yacc as yacc + + # ------- Internal calculator state + + variables = { } # Dictionary of stored variables + + # ------- Calculator tokenizing rules + + tokens = ( + 'NAME','NUMBER', + ) + + literals = ['=','+','-','*','/', '(',')'] + + t_ignore = " \t" + + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + lexer = lex.lex() + + # ------- Calculator parsing rules + + precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + + def p_statement_assign(p): + 'statement : NAME "=" expression' + variables[p[1]] = p[3] + p[0] = None + + def p_statement_expr(p): + 'statement : expression' + p[0] = p[1] + + def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+' : p[0] = p[1] + p[3] + elif p[2] == '-': p[0] = p[1] - p[3] + elif p[2] == '*': p[0] = p[1] * p[3] + elif p[2] == '/': p[0] = p[1] / p[3] + + def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + + def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + + def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + + def p_expression_name(p): + "expression : NAME" + try: + p[0] = variables[p[1]] 
+ except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + + def p_error(p): + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") + + + # Build the parser + parser = yacc.yacc() + + # ------- Input function + + def input(text): + result = parser.parse(text,lexer=lexer) + return result + + return input + +# Make a calculator object and use it +calc = make_calculator() + +while True: + try: + s = raw_input("calc > ") + except EOFError: + break + r = calc(s) + if r: + print(r) + + diff --git a/ext/ply/example/hedit/hedit.py b/ext/ply/example/hedit/hedit.py index 494f4fde5..2e80675f9 100644 --- a/ext/ply/example/hedit/hedit.py +++ b/ext/ply/example/hedit/hedit.py @@ -29,17 +29,17 @@ def t_H_EDIT_DESCRIPTOR(t): r"\d+H.*" # This grabs all of the remaining text i = t.value.index('H') n = eval(t.value[:i]) - + # Adjust the tokenizing position t.lexer.lexpos -= len(t.value) - (i+1+n) - + t.value = t.value[i+1:i+1+n] - return t - + return t + def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - + # Build the lexer import ply.lex as lex lex.lex() diff --git a/ext/ply/example/newclasscalc/calc.py b/ext/ply/example/newclasscalc/calc.py index 7f29bc821..a12e498b2 100644..100755 --- a/ext/ply/example/newclasscalc/calc.py +++ b/ext/ply/example/newclasscalc/calc.py @@ -14,7 +14,9 @@ import sys sys.path.insert(0,"../..") -import readline +if sys.version_info[0] >= 3: + raw_input = input + import ply.lex as lex import ply.yacc as yacc import os @@ -51,10 +53,10 @@ class Parser(object): s = raw_input('calc > ') except EOFError: break - if not s: continue + if not s: continue yacc.parse(s) - + class Calc(Parser): tokens = ( @@ -80,7 +82,7 @@ class Calc(Parser): try: t.value = int(t.value) except ValueError: - print "Integer value too large", t.value + print("Integer value too large %s" % t.value) t.value = 0 #print "parsed number %s" % repr(t.value) return t @@ 
-90,9 +92,9 @@ class Calc(Parser): def t_newline(self, t): r'\n+' t.lexer.lineno += t.value.count("\n") - + def t_error(self, t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) # Parsing rules @@ -110,7 +112,7 @@ class Calc(Parser): def p_statement_expr(self, p): 'statement : expression' - print p[1] + print(p[1]) def p_expression_binop(self, p): """ @@ -144,11 +146,14 @@ class Calc(Parser): try: p[0] = self.names[p[1]] except LookupError: - print "Undefined name '%s'" % p[1] + print("Undefined name '%s'" % p[1]) p[0] = 0 def p_error(self, p): - print "Syntax error at '%s'" % p.value + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") if __name__ == '__main__': calc = Calc() diff --git a/ext/ply/example/optcalc/README b/ext/ply/example/optcalc/README index 6d196f0ee..53dd5fcd5 100644 --- a/ext/ply/example/optcalc/README +++ b/ext/ply/example/optcalc/README @@ -5,5 +5,5 @@ To run: - Then run 'python -OO calc.py' -If working corretly, the second version should run the +If working correctly, the second version should run the same way. 
diff --git a/ext/ply/example/optcalc/calc.py b/ext/ply/example/optcalc/calc.py index 3a0ee6c9b..dd83351a0 100644 --- a/ext/ply/example/optcalc/calc.py +++ b/ext/ply/example/optcalc/calc.py @@ -8,6 +8,9 @@ import sys sys.path.insert(0,"../..") +if sys.version_info[0] >= 3: + raw_input = input + tokens = ( 'NAME','NUMBER', 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', @@ -30,7 +33,7 @@ def t_NUMBER(t): try: t.value = int(t.value) except ValueError: - print "Integer value too large", t.value + print("Integer value too large %s" % t.value) t.value = 0 return t @@ -39,11 +42,11 @@ t_ignore = " \t" def t_newline(t): r'\n+' t.lexer.lineno += t.value.count("\n") - + def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - + # Build the lexer import ply.lex as lex lex.lex(optimize=1) @@ -65,7 +68,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -95,11 +98,14 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + if t: + print("Syntax error at '%s'" % t.value) + else: + print("Syntax error at EOF") import ply.yacc as yacc yacc.yacc(optimize=1) diff --git a/ext/ply/example/unicalc/calc.py b/ext/ply/example/unicalc/calc.py index d1f59f748..55fb48df4 100644 --- a/ext/ply/example/unicalc/calc.py +++ b/ext/ply/example/unicalc/calc.py @@ -41,11 +41,11 @@ t_ignore = u" \t" def t_newline(t): ur'\n+' t.lexer.lineno += t.value.count("\n") - + def t_error(t): print "Illegal character '%s'" % t.value[0] t.lexer.skip(1) - + # Build the lexer import ply.lex as lex lex.lex() @@ -100,7 +100,10 @@ def p_expression_name(p): p[0] = 0 def p_error(p): - print "Syntax error at '%s'" % p.value + if p: + print "Syntax error at '%s'" % p.value + 
else: + print "Syntax error at EOF" import ply.yacc as yacc yacc.yacc() diff --git a/ext/ply/example/yply/ylex.py b/ext/ply/example/yply/ylex.py index 61bc0c7ef..84f2f7a73 100644 --- a/ext/ply/example/yply/ylex.py +++ b/ext/ply/example/yply/ylex.py @@ -42,7 +42,7 @@ def t_SECTION(t): # Comments def t_ccomment(t): r'/\*(.|\n)*?\*/' - t.lineno += t.value.count('\n') + t.lexer.lineno += t.value.count('\n') t_ignore_cppcomment = r'//.*' @@ -95,7 +95,7 @@ def t_code_error(t): raise RuntimeError def t_error(t): - print "%d: Illegal character '%s'" % (t.lineno, t.value[0]) + print "%d: Illegal character '%s'" % (t.lexer.lineno, t.value[0]) print t.value t.lexer.skip(1) @@ -104,9 +104,9 @@ lex.lex() if __name__ == '__main__': lex.runmain() + + + - - - - + diff --git a/ext/ply/example/yply/yparse.py b/ext/ply/example/yply/yparse.py index a4e46bef7..ab5b88451 100644 --- a/ext/ply/example/yply/yparse.py +++ b/ext/ply/example/yply/yparse.py @@ -25,7 +25,7 @@ def p_defsection(p): print "precedence = ", repr(preclist) print print "# -------------- RULES ----------------" - print + print def p_rulesection(p): '''rulesection : rules SECTION''' @@ -78,12 +78,12 @@ def p_idlist(p): p[1].append(p[3]) def p_tokenid(p): - '''tokenid : ID + '''tokenid : ID | ID NUMBER | QLITERAL | QLITERAL NUMBER''' p[0] = p[1] - + def p_optsemi(p): '''optsemi : ';' | empty''' @@ -165,7 +165,7 @@ def p_rule_empty(p): def p_rule_empty2(p): '''rule : ID ':' morerules ';' ''' - + p[3].insert(0,[]) p[0] = (p[1],p[3]) @@ -173,10 +173,10 @@ def p_morerules(p): '''morerules : morerules '|' rulelist | '|' rulelist | '|' ''' - - if len(p) == 2: + + if len(p) == 2: p[0] = [[]] - elif len(p) == 3: + elif len(p) == 3: p[0] = [p[2]] else: p[0] = p[1] diff --git a/ext/ply/example/yply/yply.py b/ext/ply/example/yply/yply.py index a4398171e..a4398171e 100644..100755 --- a/ext/ply/example/yply/yply.py +++ b/ext/ply/example/yply/yply.py diff --git a/ext/ply/ply/cpp.py b/ext/ply/ply/cpp.py new file mode 100644 index 
000000000..39f9d47f3 --- /dev/null +++ b/ext/ply/ply/cpp.py @@ -0,0 +1,898 @@ +# ----------------------------------------------------------------------------- +# cpp.py +# +# Author: David Beazley (http://www.dabeaz.com) +# Copyright (C) 2007 +# All rights reserved +# +# This module implements an ANSI-C style lexical preprocessor for PLY. +# ----------------------------------------------------------------------------- +from __future__ import generators + +# ----------------------------------------------------------------------------- +# Default preprocessor lexer definitions. These tokens are enough to get +# a basic preprocessor working. Other modules may import these if they want +# ----------------------------------------------------------------------------- + +tokens = ( + 'CPP_ID','CPP_INTEGER', 'CPP_FLOAT', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_COMMENT', 'CPP_POUND','CPP_DPOUND' +) + +literals = "+-*/%|&~^<>=!?()[]{}.,;:\\\'\"" + +# Whitespace +def t_CPP_WS(t): + r'\s+' + t.lexer.lineno += t.value.count("\n") + return t + +t_CPP_POUND = r'\#' +t_CPP_DPOUND = r'\#\#' + +# Identifier +t_CPP_ID = r'[A-Za-z_][\w_]*' + +# Integer literal +def CPP_INTEGER(t): + r'(((((0x)|(0X))[0-9a-fA-F]+)|(\d+))([uU]|[lL]|[uU][lL]|[lL][uU])?)' + return t + +t_CPP_INTEGER = CPP_INTEGER + +# Floating literal +t_CPP_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?' 
+ +# String literal +def t_CPP_STRING(t): + r'\"([^\\\n]|(\\(.|\n)))*?\"' + t.lexer.lineno += t.value.count("\n") + return t + +# Character constant 'c' or L'c' +def t_CPP_CHAR(t): + r'(L)?\'([^\\\n]|(\\(.|\n)))*?\'' + t.lexer.lineno += t.value.count("\n") + return t + +# Comment +def t_CPP_COMMENT(t): + r'(/\*(.|\n)*?\*/)|(//.*?\n)' + t.lexer.lineno += t.value.count("\n") + return t + +def t_error(t): + t.type = t.value[0] + t.value = t.value[0] + t.lexer.skip(1) + return t + +import re +import copy +import time +import os.path + +# ----------------------------------------------------------------------------- +# trigraph() +# +# Given an input string, this function replaces all trigraph sequences. +# The following mapping is used: +# +# ??= # +# ??/ \ +# ??' ^ +# ??( [ +# ??) ] +# ??! | +# ??< { +# ??> } +# ??- ~ +# ----------------------------------------------------------------------------- + +_trigraph_pat = re.compile(r'''\?\?[=/\'\(\)\!<>\-]''') +_trigraph_rep = { + '=':'#', + '/':'\\', + "'":'^', + '(':'[', + ')':']', + '!':'|', + '<':'{', + '>':'}', + '-':'~' +} + +def trigraph(input): + return _trigraph_pat.sub(lambda g: _trigraph_rep[g.group()[-1]],input) + +# ------------------------------------------------------------------ +# Macro object +# +# This object holds information about preprocessor macros +# +# .name - Macro name (string) +# .value - Macro value (a list of tokens) +# .arglist - List of argument names +# .variadic - Boolean indicating whether or not variadic macro +# .vararg - Name of the variadic parameter +# +# When a macro is created, the macro replacement token sequence is +# pre-scanned and used to create patch lists that are later used +# during macro expansion +# ------------------------------------------------------------------ + +class Macro(object): + def __init__(self,name,value,arglist=None,variadic=False): + self.name = name + self.value = value + self.arglist = arglist + self.variadic = variadic + if variadic: + self.vararg = 
arglist[-1] + self.source = None + +# ------------------------------------------------------------------ +# Preprocessor object +# +# Object representing a preprocessor. Contains macro definitions, +# include directories, and other information +# ------------------------------------------------------------------ + +class Preprocessor(object): + def __init__(self,lexer=None): + if lexer is None: + lexer = lex.lexer + self.lexer = lexer + self.macros = { } + self.path = [] + self.temp_path = [] + + # Probe the lexer for selected tokens + self.lexprobe() + + tm = time.localtime() + self.define("__DATE__ \"%s\"" % time.strftime("%b %d %Y",tm)) + self.define("__TIME__ \"%s\"" % time.strftime("%H:%M:%S",tm)) + self.parser = None + + # ----------------------------------------------------------------------------- + # tokenize() + # + # Utility function. Given a string of text, tokenize into a list of tokens + # ----------------------------------------------------------------------------- + + def tokenize(self,text): + tokens = [] + self.lexer.input(text) + while True: + tok = self.lexer.token() + if not tok: break + tokens.append(tok) + return tokens + + # --------------------------------------------------------------------- + # error() + # + # Report a preprocessor error/warning of some kind + # ---------------------------------------------------------------------- + + def error(self,file,line,msg): + print >>sys.stderr,"%s:%d %s" % (file,line,msg) + + # ---------------------------------------------------------------------- + # lexprobe() + # + # This method probes the preprocessor lexer object to discover + # the token types of symbols that are important to the preprocessor. + # If this works right, the preprocessor will simply "work" + # with any suitable lexer regardless of how tokens have been named. 
+ # ---------------------------------------------------------------------- + + def lexprobe(self): + + # Determine the token type for identifiers + self.lexer.input("identifier") + tok = self.lexer.token() + if not tok or tok.value != "identifier": + print "Couldn't determine identifier type" + else: + self.t_ID = tok.type + + # Determine the token type for integers + self.lexer.input("12345") + tok = self.lexer.token() + if not tok or int(tok.value) != 12345: + print "Couldn't determine integer type" + else: + self.t_INTEGER = tok.type + self.t_INTEGER_TYPE = type(tok.value) + + # Determine the token type for strings enclosed in double quotes + self.lexer.input("\"filename\"") + tok = self.lexer.token() + if not tok or tok.value != "\"filename\"": + print "Couldn't determine string type" + else: + self.t_STRING = tok.type + + # Determine the token type for whitespace--if any + self.lexer.input(" ") + tok = self.lexer.token() + if not tok or tok.value != " ": + self.t_SPACE = None + else: + self.t_SPACE = tok.type + + # Determine the token type for newlines + self.lexer.input("\n") + tok = self.lexer.token() + if not tok or tok.value != "\n": + self.t_NEWLINE = None + print "Couldn't determine token for newlines" + else: + self.t_NEWLINE = tok.type + + self.t_WS = (self.t_SPACE, self.t_NEWLINE) + + # Check for other characters used by the preprocessor + chars = [ '<','>','#','##','\\','(',')',',','.'] + for c in chars: + self.lexer.input(c) + tok = self.lexer.token() + if not tok or tok.value != c: + print "Unable to lex '%s' required for preprocessor" % c + + # ---------------------------------------------------------------------- + # add_path() + # + # Adds a search path to the preprocessor. 
+ # ---------------------------------------------------------------------- + + def add_path(self,path): + self.path.append(path) + + # ---------------------------------------------------------------------- + # group_lines() + # + # Given an input string, this function splits it into lines. Trailing whitespace + # is removed. Any line ending with \ is grouped with the next line. This + # function forms the lowest level of the preprocessor---grouping into text into + # a line-by-line format. + # ---------------------------------------------------------------------- + + def group_lines(self,input): + lex = self.lexer.clone() + lines = [x.rstrip() for x in input.splitlines()] + for i in xrange(len(lines)): + j = i+1 + while lines[i].endswith('\\') and (j < len(lines)): + lines[i] = lines[i][:-1]+lines[j] + lines[j] = "" + j += 1 + + input = "\n".join(lines) + lex.input(input) + lex.lineno = 1 + + current_line = [] + while True: + tok = lex.token() + if not tok: + break + current_line.append(tok) + if tok.type in self.t_WS and '\n' in tok.value: + yield current_line + current_line = [] + + if current_line: + yield current_line + + # ---------------------------------------------------------------------- + # tokenstrip() + # + # Remove leading/trailing whitespace tokens from a token list + # ---------------------------------------------------------------------- + + def tokenstrip(self,tokens): + i = 0 + while i < len(tokens) and tokens[i].type in self.t_WS: + i += 1 + del tokens[:i] + i = len(tokens)-1 + while i >= 0 and tokens[i].type in self.t_WS: + i -= 1 + del tokens[i+1:] + return tokens + + + # ---------------------------------------------------------------------- + # collect_args() + # + # Collects comma separated arguments from a list of tokens. The arguments + # must be enclosed in parenthesis. 
Returns a tuple (tokencount,args,positions) + # where tokencount is the number of tokens consumed, args is a list of arguments, + # and positions is a list of integers containing the starting index of each + # argument. Each argument is represented by a list of tokens. + # + # When collecting arguments, leading and trailing whitespace is removed + # from each argument. + # + # This function properly handles nested parenthesis and commas---these do not + # define new arguments. + # ---------------------------------------------------------------------- + + def collect_args(self,tokenlist): + args = [] + positions = [] + current_arg = [] + nesting = 1 + tokenlen = len(tokenlist) + + # Search for the opening '('. + i = 0 + while (i < tokenlen) and (tokenlist[i].type in self.t_WS): + i += 1 + + if (i < tokenlen) and (tokenlist[i].value == '('): + positions.append(i+1) + else: + self.error(self.source,tokenlist[0].lineno,"Missing '(' in macro arguments") + return 0, [], [] + + i += 1 + + while i < tokenlen: + t = tokenlist[i] + if t.value == '(': + current_arg.append(t) + nesting += 1 + elif t.value == ')': + nesting -= 1 + if nesting == 0: + if current_arg: + args.append(self.tokenstrip(current_arg)) + positions.append(i) + return i+1,args,positions + current_arg.append(t) + elif t.value == ',' and nesting == 1: + args.append(self.tokenstrip(current_arg)) + positions.append(i+1) + current_arg = [] + else: + current_arg.append(t) + i += 1 + + # Missing end argument + self.error(self.source,tokenlist[-1].lineno,"Missing ')' in macro arguments") + return 0, [],[] + + # ---------------------------------------------------------------------- + # macro_prescan() + # + # Examine the macro value (token sequence) and identify patch points + # This is used to speed up macro expansion later on---we'll know + # right away where to apply patches to the value to form the expansion + # ---------------------------------------------------------------------- + + def 
macro_prescan(self,macro): + macro.patch = [] # Standard macro arguments + macro.str_patch = [] # String conversion expansion + macro.var_comma_patch = [] # Variadic macro comma patch + i = 0 + while i < len(macro.value): + if macro.value[i].type == self.t_ID and macro.value[i].value in macro.arglist: + argnum = macro.arglist.index(macro.value[i].value) + # Conversion of argument to a string + if i > 0 and macro.value[i-1].value == '#': + macro.value[i] = copy.copy(macro.value[i]) + macro.value[i].type = self.t_STRING + del macro.value[i-1] + macro.str_patch.append((argnum,i-1)) + continue + # Concatenation + elif (i > 0 and macro.value[i-1].value == '##'): + macro.patch.append(('c',argnum,i-1)) + del macro.value[i-1] + continue + elif ((i+1) < len(macro.value) and macro.value[i+1].value == '##'): + macro.patch.append(('c',argnum,i)) + i += 1 + continue + # Standard expansion + else: + macro.patch.append(('e',argnum,i)) + elif macro.value[i].value == '##': + if macro.variadic and (i > 0) and (macro.value[i-1].value == ',') and \ + ((i+1) < len(macro.value)) and (macro.value[i+1].type == self.t_ID) and \ + (macro.value[i+1].value == macro.vararg): + macro.var_comma_patch.append(i-1) + i += 1 + macro.patch.sort(key=lambda x: x[2],reverse=True) + + # ---------------------------------------------------------------------- + # macro_expand_args() + # + # Given a Macro and list of arguments (each a token list), this method + # returns an expanded version of a macro. The return value is a token sequence + # representing the replacement macro tokens + # ---------------------------------------------------------------------- + + def macro_expand_args(self,macro,args): + # Make a copy of the macro token sequence + rep = [copy.copy(_x) for _x in macro.value] + + # Make string expansion patches. 
These do not alter the length of the replacement sequence + + str_expansion = {} + for argnum, i in macro.str_patch: + if argnum not in str_expansion: + str_expansion[argnum] = ('"%s"' % "".join([x.value for x in args[argnum]])).replace("\\","\\\\") + rep[i] = copy.copy(rep[i]) + rep[i].value = str_expansion[argnum] + + # Make the variadic macro comma patch. If the variadic macro argument is empty, we get rid + comma_patch = False + if macro.variadic and not args[-1]: + for i in macro.var_comma_patch: + rep[i] = None + comma_patch = True + + # Make all other patches. The order of these matters. It is assumed that the patch list + # has been sorted in reverse order of patch location since replacements will cause the + # size of the replacement sequence to expand from the patch point. + + expanded = { } + for ptype, argnum, i in macro.patch: + # Concatenation. Argument is left unexpanded + if ptype == 'c': + rep[i:i+1] = args[argnum] + # Normal expansion. Argument is macro expanded first + elif ptype == 'e': + if argnum not in expanded: + expanded[argnum] = self.expand_macros(args[argnum]) + rep[i:i+1] = expanded[argnum] + + # Get rid of removed comma if necessary + if comma_patch: + rep = [_i for _i in rep if _i] + + return rep + + + # ---------------------------------------------------------------------- + # expand_macros() + # + # Given a list of tokens, this function performs macro expansion. + # The expanded argument is a dictionary that contains macros already + # expanded. This is used to prevent infinite recursion. 
+ # ---------------------------------------------------------------------- + + def expand_macros(self,tokens,expanded=None): + if expanded is None: + expanded = {} + i = 0 + while i < len(tokens): + t = tokens[i] + if t.type == self.t_ID: + if t.value in self.macros and t.value not in expanded: + # Yes, we found a macro match + expanded[t.value] = True + + m = self.macros[t.value] + if not m.arglist: + # A simple macro + ex = self.expand_macros([copy.copy(_x) for _x in m.value],expanded) + for e in ex: + e.lineno = t.lineno + tokens[i:i+1] = ex + i += len(ex) + else: + # A macro with arguments + j = i + 1 + while j < len(tokens) and tokens[j].type in self.t_WS: + j += 1 + if tokens[j].value == '(': + tokcount,args,positions = self.collect_args(tokens[j:]) + if not m.variadic and len(args) != len(m.arglist): + self.error(self.source,t.lineno,"Macro %s requires %d arguments" % (t.value,len(m.arglist))) + i = j + tokcount + elif m.variadic and len(args) < len(m.arglist)-1: + if len(m.arglist) > 2: + self.error(self.source,t.lineno,"Macro %s must have at least %d arguments" % (t.value, len(m.arglist)-1)) + else: + self.error(self.source,t.lineno,"Macro %s must have at least %d argument" % (t.value, len(m.arglist)-1)) + i = j + tokcount + else: + if m.variadic: + if len(args) == len(m.arglist)-1: + args.append([]) + else: + args[len(m.arglist)-1] = tokens[j+positions[len(m.arglist)-1]:j+tokcount-1] + del args[len(m.arglist):] + + # Get macro replacement text + rep = self.macro_expand_args(m,args) + rep = self.expand_macros(rep,expanded) + for r in rep: + r.lineno = t.lineno + tokens[i:j+tokcount] = rep + i += len(rep) + del expanded[t.value] + continue + elif t.value == '__LINE__': + t.type = self.t_INTEGER + t.value = self.t_INTEGER_TYPE(t.lineno) + + i += 1 + return tokens + + # ---------------------------------------------------------------------- + # evalexpr() + # + # Evaluate an expression token sequence for the purposes of evaluating + # integral expressions. 
+ # ---------------------------------------------------------------------- + + def evalexpr(self,tokens): + # tokens = tokenize(line) + # Search for defined macros + i = 0 + while i < len(tokens): + if tokens[i].type == self.t_ID and tokens[i].value == 'defined': + j = i + 1 + needparen = False + result = "0L" + while j < len(tokens): + if tokens[j].type in self.t_WS: + j += 1 + continue + elif tokens[j].type == self.t_ID: + if tokens[j].value in self.macros: + result = "1L" + else: + result = "0L" + if not needparen: break + elif tokens[j].value == '(': + needparen = True + elif tokens[j].value == ')': + break + else: + self.error(self.source,tokens[i].lineno,"Malformed defined()") + j += 1 + tokens[i].type = self.t_INTEGER + tokens[i].value = self.t_INTEGER_TYPE(result) + del tokens[i+1:j+1] + i += 1 + tokens = self.expand_macros(tokens) + for i,t in enumerate(tokens): + if t.type == self.t_ID: + tokens[i] = copy.copy(t) + tokens[i].type = self.t_INTEGER + tokens[i].value = self.t_INTEGER_TYPE("0L") + elif t.type == self.t_INTEGER: + tokens[i] = copy.copy(t) + # Strip off any trailing suffixes + tokens[i].value = str(tokens[i].value) + while tokens[i].value[-1] not in "0123456789abcdefABCDEF": + tokens[i].value = tokens[i].value[:-1] + + expr = "".join([str(x.value) for x in tokens]) + expr = expr.replace("&&"," and ") + expr = expr.replace("||"," or ") + expr = expr.replace("!"," not ") + try: + result = eval(expr) + except StandardError: + self.error(self.source,tokens[0].lineno,"Couldn't evaluate expression") + result = 0 + return result + + # ---------------------------------------------------------------------- + # parsegen() + # + # Parse an input string/ + # ---------------------------------------------------------------------- + def parsegen(self,input,source=None): + + # Replace trigraph sequences + t = trigraph(input) + lines = self.group_lines(t) + + if not source: + source = "" + + self.define("__FILE__ \"%s\"" % source) + + self.source = source + 
chunk = [] + enable = True + iftrigger = False + ifstack = [] + + for x in lines: + for i,tok in enumerate(x): + if tok.type not in self.t_WS: break + if tok.value == '#': + # Preprocessor directive + + for tok in x: + if tok in self.t_WS and '\n' in tok.value: + chunk.append(tok) + + dirtokens = self.tokenstrip(x[i+1:]) + if dirtokens: + name = dirtokens[0].value + args = self.tokenstrip(dirtokens[1:]) + else: + name = "" + args = [] + + if name == 'define': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + self.define(args) + elif name == 'include': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + oldfile = self.macros['__FILE__'] + for tok in self.include(args): + yield tok + self.macros['__FILE__'] = oldfile + self.source = source + elif name == 'undef': + if enable: + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + self.undef(args) + elif name == 'ifdef': + ifstack.append((enable,iftrigger)) + if enable: + if not args[0].value in self.macros: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'ifndef': + ifstack.append((enable,iftrigger)) + if enable: + if args[0].value in self.macros: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'if': + ifstack.append((enable,iftrigger)) + if enable: + result = self.evalexpr(args) + if not result: + enable = False + iftrigger = False + else: + iftrigger = True + elif name == 'elif': + if ifstack: + if ifstack[-1][0]: # We only pay attention if outer "if" allows this + if enable: # If already true, we flip enable False + enable = False + elif not iftrigger: # If False, but not triggered yet, we'll check expression + result = self.evalexpr(args) + if result: + enable = True + iftrigger = True + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #elif") + + elif name == 'else': + if ifstack: + if ifstack[-1][0]: + if enable: + enable = False + elif not iftrigger: + enable = True + 
iftrigger = True + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #else") + + elif name == 'endif': + if ifstack: + enable,iftrigger = ifstack.pop() + else: + self.error(self.source,dirtokens[0].lineno,"Misplaced #endif") + else: + # Unknown preprocessor directive + pass + + else: + # Normal text + if enable: + chunk.extend(x) + + for tok in self.expand_macros(chunk): + yield tok + chunk = [] + + # ---------------------------------------------------------------------- + # include() + # + # Implementation of file-inclusion + # ---------------------------------------------------------------------- + + def include(self,tokens): + # Try to extract the filename and then process an include file + if not tokens: + return + if tokens: + if tokens[0].value != '<' and tokens[0].type != self.t_STRING: + tokens = self.expand_macros(tokens) + + if tokens[0].value == '<': + # Include <...> + i = 1 + while i < len(tokens): + if tokens[i].value == '>': + break + i += 1 + else: + print "Malformed #include <...>" + return + filename = "".join([x.value for x in tokens[1:i]]) + path = self.path + [""] + self.temp_path + elif tokens[0].type == self.t_STRING: + filename = tokens[0].value[1:-1] + path = self.temp_path + [""] + self.path + else: + print "Malformed #include statement" + return + for p in path: + iname = os.path.join(p,filename) + try: + data = open(iname,"r").read() + dname = os.path.dirname(iname) + if dname: + self.temp_path.insert(0,dname) + for tok in self.parsegen(data,filename): + yield tok + if dname: + del self.temp_path[0] + break + except IOError,e: + pass + else: + print "Couldn't find '%s'" % filename + + # ---------------------------------------------------------------------- + # define() + # + # Define a new macro + # ---------------------------------------------------------------------- + + def define(self,tokens): + if isinstance(tokens,(str,unicode)): + tokens = self.tokenize(tokens) + + linetok = tokens + try: + name = linetok[0] + if 
len(linetok) > 1: + mtype = linetok[1] + else: + mtype = None + if not mtype: + m = Macro(name.value,[]) + self.macros[name.value] = m + elif mtype.type in self.t_WS: + # A normal macro + m = Macro(name.value,self.tokenstrip(linetok[2:])) + self.macros[name.value] = m + elif mtype.value == '(': + # A macro with arguments + tokcount, args, positions = self.collect_args(linetok[1:]) + variadic = False + for a in args: + if variadic: + print "No more arguments may follow a variadic argument" + break + astr = "".join([str(_i.value) for _i in a]) + if astr == "...": + variadic = True + a[0].type = self.t_ID + a[0].value = '__VA_ARGS__' + variadic = True + del a[1:] + continue + elif astr[-3:] == "..." and a[0].type == self.t_ID: + variadic = True + del a[1:] + # If, for some reason, "." is part of the identifier, strip off the name for the purposes + # of macro expansion + if a[0].value[-3:] == '...': + a[0].value = a[0].value[:-3] + continue + if len(a) > 1 or a[0].type != self.t_ID: + print "Invalid macro argument" + break + else: + mvalue = self.tokenstrip(linetok[1+tokcount:]) + i = 0 + while i < len(mvalue): + if i+1 < len(mvalue): + if mvalue[i].type in self.t_WS and mvalue[i+1].value == '##': + del mvalue[i] + continue + elif mvalue[i].value == '##' and mvalue[i+1].type in self.t_WS: + del mvalue[i+1] + i += 1 + m = Macro(name.value,mvalue,[x[0].value for x in args],variadic) + self.macro_prescan(m) + self.macros[name.value] = m + else: + print "Bad macro definition" + except LookupError: + print "Bad macro definition" + + # ---------------------------------------------------------------------- + # undef() + # + # Undefine a macro + # ---------------------------------------------------------------------- + + def undef(self,tokens): + id = tokens[0].value + try: + del self.macros[id] + except LookupError: + pass + + # ---------------------------------------------------------------------- + # parse() + # + # Parse input text. 
+ # ---------------------------------------------------------------------- + def parse(self,input,source=None,ignore={}): + self.ignore = ignore + self.parser = self.parsegen(input,source) + + # ---------------------------------------------------------------------- + # token() + # + # Method to return individual tokens + # ---------------------------------------------------------------------- + def token(self): + try: + while True: + tok = self.parser.next() + if tok.type not in self.ignore: return tok + except StopIteration: + self.parser = None + return None + +if __name__ == '__main__': + import ply.lex as lex + lexer = lex.lex() + + # Run a preprocessor + import sys + f = open(sys.argv[1]) + input = f.read() + + p = Preprocessor(lexer) + p.parse(input,sys.argv[1]) + while True: + tok = p.token() + if not tok: break + print p.source, tok + + + + + + + + + + + diff --git a/ext/ply/ply/ctokens.py b/ext/ply/ply/ctokens.py new file mode 100644 index 000000000..dd5f102dc --- /dev/null +++ b/ext/ply/ply/ctokens.py @@ -0,0 +1,133 @@ +# ---------------------------------------------------------------------- +# ctokens.py +# +# Token specifications for symbols in ANSI C and C++. This file is +# meant to be used as a library in other tokenizers. 
+# ---------------------------------------------------------------------- + +# Reserved words + +tokens = [ + # Literals (identifier, integer constant, float constant, string constant, char const) + 'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST', + + # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=) + 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD', + 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT', + 'LOR', 'LAND', 'LNOT', + 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE', + + # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=) + 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL', + 'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', + + # Increment/decrement (++,--) + 'PLUSPLUS', 'MINUSMINUS', + + # Structure dereference (->) + 'ARROW', + + # Ternary operator (?) + 'TERNARY', + + # Delimeters ( ) [ ] { } , . ; : + 'LPAREN', 'RPAREN', + 'LBRACKET', 'RBRACKET', + 'LBRACE', 'RBRACE', + 'COMMA', 'PERIOD', 'SEMI', 'COLON', + + # Ellipsis (...) + 'ELLIPSIS', +] + +# Operators +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_MODULO = r'%' +t_OR = r'\|' +t_AND = r'&' +t_NOT = r'~' +t_XOR = r'\^' +t_LSHIFT = r'<<' +t_RSHIFT = r'>>' +t_LOR = r'\|\|' +t_LAND = r'&&' +t_LNOT = r'!' +t_LT = r'<' +t_GT = r'>' +t_LE = r'<=' +t_GE = r'>=' +t_EQ = r'==' +t_NE = r'!=' + +# Assignment operators + +t_EQUALS = r'=' +t_TIMESEQUAL = r'\*=' +t_DIVEQUAL = r'/=' +t_MODEQUAL = r'%=' +t_PLUSEQUAL = r'\+=' +t_MINUSEQUAL = r'-=' +t_LSHIFTEQUAL = r'<<=' +t_RSHIFTEQUAL = r'>>=' +t_ANDEQUAL = r'&=' +t_OREQUAL = r'\|=' +t_XOREQUAL = r'^=' + +# Increment/decrement +t_INCREMENT = r'\+\+' +t_DECREMENT = r'--' + +# -> +t_ARROW = r'->' + +# ? +t_TERNARY = r'\?' + +# Delimeters +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_LBRACKET = r'\[' +t_RBRACKET = r'\]' +t_LBRACE = r'\{' +t_RBRACE = r'\}' +t_COMMA = r',' +t_PERIOD = r'\.' +t_SEMI = r';' +t_COLON = r':' +t_ELLIPSIS = r'\.\.\.' 
+ +# Identifiers +t_ID = r'[A-Za-z_][A-Za-z0-9_]*' + +# Integer literal +t_INTEGER = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?' + +# Floating literal +t_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?' + +# String literal +t_STRING = r'\"([^\\\n]|(\\.))*?\"' + +# Character constant 'c' or L'c' +t_CHARACTER = r'(L)?\'([^\\\n]|(\\.))*?\'' + +# Comment (C-Style) +def t_COMMENT(t): + r'/\*(.|\n)*?\*/' + t.lexer.lineno += t.value.count('\n') + return t + +# Comment (C++-Style) +def t_CPPCOMMENT(t): + r'//.*\n' + t.lexer.lineno += 1 + return t + + + + + + diff --git a/ext/ply/ply/lex.py b/ext/ply/ply/lex.py index 782b29286..4759d1b7a 100644 --- a/ext/ply/ply/lex.py +++ b/ext/ply/ply/lex.py @@ -1,67 +1,114 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # ply: lex.py # -# Author: David M. Beazley (dave@dabeaz.com) -# -# Copyright (C) 2001-2007, David M. Beazley -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. +# Copyright (C) 2001-2009, +# David M. Beazley (Dabeaz LLC) +# All rights reserved. # -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of the David Beazley or Dabeaz LLC may be used to +# endorse or promote products derived from this software without +# specific prior written permission. # -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See the file COPYING for a complete copy of the LGPL. -#----------------------------------------------------------------------------- - -__version__ = "2.3" - -import re, sys, types +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- -# Regular expression used to match valid token names -_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') +__version__ = "3.2" +__tabversion__ = "3.2" # Version of table file used -# Available instance types. This is used when lexers are defined by a class. 
-# It's a little funky because I want to preserve backwards compatibility -# with Python 2.0 where types.ObjectType is undefined. +import re, sys, types, copy, os +# This tuple contains known string types try: - _INSTANCETYPE = (types.InstanceType, types.ObjectType) + # Python 2.6 + StringTypes = (types.StringType, types.UnicodeType) except AttributeError: - _INSTANCETYPE = types.InstanceType - class object: pass # Note: needed if no new-style classes present + # Python 3.0 + StringTypes = (str, bytes) + +# Extract the code attribute of a function. Different implementations +# are for Python 2/3 compatibility. + +if sys.version_info[0] < 3: + def func_code(f): + return f.func_code +else: + def func_code(f): + return f.__code__ + +# This regular expression is used to match valid token names +_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') # Exception thrown when invalid token encountered and no default error # handler is defined. + class LexError(Exception): def __init__(self,message,s): self.args = (message,) self.text = s -# Token class +# Token class. This class is used to represent the tokens produced. class LexToken(object): def __str__(self): return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) def __repr__(self): return str(self) - def skip(self,n): - self.lexer.skip(n) + +# This object is a stand-in for a logging object created by the +# logging module. + +class PlyLogger(object): + def __init__(self,f): + self.f = f + def critical(self,msg,*args,**kwargs): + self.f.write((msg % args) + "\n") + + def warning(self,msg,*args,**kwargs): + self.f.write("WARNING: "+ (msg % args) + "\n") + + def error(self,msg,*args,**kwargs): + self.f.write("ERROR: " + (msg % args) + "\n") + + info = critical + debug = critical + +# Null logger is used when no output is generated. Does nothing. 
+class NullLogger(object): + def __getattribute__(self,name): + return self + def __call__(self,*args,**kwargs): + return self # ----------------------------------------------------------------------------- -# Lexer class +# === Lexing Engine === # -# This class encapsulates all of the methods and data associated with a lexer. +# The following Lexer class implements the lexer runtime. There are only +# a few public methods and attributes: # # input() - Store a new string in the lexer # token() - Get the next token +# clone() - Clone the lexer +# +# lineno - Current line number +# lexpos - Current position in the input string # ----------------------------------------------------------------------------- class Lexer: @@ -73,6 +120,7 @@ class Lexer: self.lexretext = None # Current regular expression strings self.lexstatere = {} # Dictionary mapping lexer states to master regexs self.lexstateretext = {} # Dictionary mapping lexer states to regex strings + self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names self.lexstate = "INITIAL" # Current lexer state self.lexstatestack = [] # Stack of lexer states self.lexstateinfo = None # State information @@ -88,28 +136,10 @@ class Lexer: self.lexliterals = "" # Literal characters that can be passed through self.lexmodule = None # Module self.lineno = 1 # Current line number - self.lexdebug = 0 # Debugging mode self.lexoptimize = 0 # Optimized mode def clone(self,object=None): - c = Lexer() - c.lexstatere = self.lexstatere - c.lexstateinfo = self.lexstateinfo - c.lexstateretext = self.lexstateretext - c.lexstate = self.lexstate - c.lexstatestack = self.lexstatestack - c.lexstateignore = self.lexstateignore - c.lexstateerrorf = self.lexstateerrorf - c.lexreflags = self.lexreflags - c.lexdata = self.lexdata - c.lexpos = self.lexpos - c.lexlen = self.lexlen - c.lextokens = self.lextokens - c.lexdebug = self.lexdebug - c.lineno = self.lineno - c.lexoptimize = self.lexoptimize - c.lexliterals = 
self.lexliterals - c.lexmodule = self.lexmodule + c = copy.copy(self) # If the object parameter has been supplied, it means we are attaching the # lexer to a new object. In this case, we have to rebind all methods in @@ -133,27 +163,37 @@ class Lexer: for key, ef in self.lexstateerrorf.items(): c.lexstateerrorf[key] = getattr(object,ef.__name__) c.lexmodule = object - - # Set up other attributes - c.begin(c.lexstate) return c # ------------------------------------------------------------ # writetab() - Write lexer information to a table file # ------------------------------------------------------------ - def writetab(self,tabfile): - tf = open(tabfile+".py","w") + def writetab(self,tabfile,outputdir=""): + if isinstance(tabfile,types.ModuleType): + return + basetabfilename = tabfile.split(".")[-1] + filename = os.path.join(outputdir,basetabfilename)+".py" + tf = open(filename,"w") tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__)) + tf.write("_tabversion = %s\n" % repr(__version__)) tf.write("_lextokens = %s\n" % repr(self.lextokens)) tf.write("_lexreflags = %s\n" % repr(self.lexreflags)) tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) tabre = { } + # Collect all functions in the initial state + initial = self.lexstatere["INITIAL"] + initialfuncs = [] + for part in initial: + for f in part[1]: + if f and f[0]: + initialfuncs.append(f) + for key, lre in self.lexstatere.items(): titem = [] for i in range(len(lre)): - titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1]))) + titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i]))) tabre[key] = titem tf.write("_lexstatere = %s\n" % repr(tabre)) @@ -172,7 +212,19 @@ class Lexer: # readtab() - Read lexer information from a tab file # ------------------------------------------------------------ def readtab(self,tabfile,fdict): - exec 
"import %s as lextab" % tabfile + if isinstance(tabfile,types.ModuleType): + lextab = tabfile + else: + if sys.version_info[0] < 3: + exec("import %s as lextab" % tabfile) + else: + env = { } + exec("import %s as lextab" % tabfile, env,env) + lextab = env['lextab'] + + if getattr(lextab,"_tabversion","0.0") != __version__: + raise ImportError("Inconsistent PLY version") + self.lextokens = lextab._lextokens self.lexreflags = lextab._lexreflags self.lexliterals = lextab._lexliterals @@ -197,8 +249,10 @@ class Lexer: # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): - if not (isinstance(s,types.StringType) or isinstance(s,types.UnicodeType)): - raise ValueError, "Expected a string" + # Pull off the first character to see if s looks like a string + c = s[:1] + if not isinstance(c,StringTypes): + raise ValueError("Expected a string") self.lexdata = s self.lexpos = 0 self.lexlen = len(s) @@ -207,8 +261,8 @@ class Lexer: # begin() - Changes the lexing state # ------------------------------------------------------------ def begin(self,state): - if not self.lexstatere.has_key(state): - raise ValueError, "Undefined state" + if not state in self.lexstatere: + raise ValueError("Undefined state") self.lexre = self.lexstatere[state] self.lexretext = self.lexstateretext[state] self.lexignore = self.lexstateignore.get(state,"") @@ -241,7 +295,7 @@ class Lexer: self.lexpos += n # ------------------------------------------------------------ - # token() - Return the next token from the Lexer + # opttoken() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. 
Don't make changes unless you really know what @@ -265,43 +319,45 @@ class Lexer: m = lexre.match(lexdata,lexpos) if not m: continue - # Set last match in lexer so that rules can access it if they want - self.lexmatch = m - # Create a token for return tok = LexToken() tok.value = m.group() tok.lineno = self.lineno tok.lexpos = lexpos - tok.lexer = self - lexpos = m.end() i = m.lastindex func,tok.type = lexindexfunc[i] - self.lexpos = lexpos if not func: # If no token type was set, it's an ignored token - if tok.type: return tok - break + if tok.type: + self.lexpos = m.end() + return tok + else: + lexpos = m.end() + break - # if func not callable, it means it's an ignored token - if not callable(func): - break + lexpos = m.end() # If token is processed by a function, call it + + tok.lexer = self # Set additional attributes useful in token rules + self.lexmatch = m + self.lexpos = lexpos + newtok = func(tok) # Every function must return a token, if nothing, we just move to next token if not newtok: - lexpos = self.lexpos # This is here in case user has updated lexpos. + lexpos = self.lexpos # This is here in case user has updated lexpos. + lexignore = self.lexignore # This is here in case there was a state change break # Verify type of the token. 
If not in the token map, raise an error if not self.lexoptimize: - if not self.lextokens.has_key(newtok.type): - raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( - func.func_code.co_filename, func.func_code.co_firstlineno, + if not newtok.type in self.lextokens: + raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( + func_code(func).co_filename, func_code(func).co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok @@ -311,7 +367,6 @@ class Lexer: tok = LexToken() tok.value = lexdata[lexpos] tok.lineno = self.lineno - tok.lexer = self tok.type = tok.value tok.lexpos = lexpos self.lexpos = lexpos + 1 @@ -329,58 +384,60 @@ class Lexer: newtok = self.lexerrorf(tok) if lexpos == self.lexpos: # Error method didn't change text position at all. This is an error. - raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) lexpos = self.lexpos if not newtok: continue return newtok self.lexpos = lexpos - raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:]) + raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:]) self.lexpos = lexpos + 1 if self.lexdata is None: - raise RuntimeError, "No input string given with input()" + raise RuntimeError("No input string given with input()") return None + # Iterator interface + def __iter__(self): + return self + + def next(self): + t = self.token() + if t is None: + raise StopIteration + return t + + __next__ = next + # ----------------------------------------------------------------------------- -# _validate_file() +# ==== Lex Builder === # -# This checks to see if there are duplicated t_rulename() functions or strings -# in the parser input file. This is done using a simple regular expression -# match on each line in the filename. 
+# The functions and classes below are used to collect lexing information +# and build a Lexer object from it. # ----------------------------------------------------------------------------- -def _validate_file(filename): - import os.path - base,ext = os.path.splitext(filename) - if ext != '.py': return 1 # No idea what the file is. Return OK +# ----------------------------------------------------------------------------- +# get_caller_module_dict() +# +# This function returns a dictionary containing all of the symbols defined within +# a caller further down the call stack. This is used to get the environment +# associated with the yacc() call if none was provided. +# ----------------------------------------------------------------------------- +def get_caller_module_dict(levels): try: - f = open(filename) - lines = f.readlines() - f.close() - except IOError: - return 1 # Oh well - - fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') - sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') - counthash = { } - linen = 1 - noerror = 1 - for l in lines: - m = fre.match(l) - if not m: - m = sre.match(l) - if m: - name = m.group(1) - prev = counthash.get(name) - if not prev: - counthash[name] = linen - else: - print >>sys.stderr, "%s:%d: Rule %s redefined. 
Previously defined on line %d" % (filename,linen,name,prev) - noerror = 0 - linen += 1 - return noerror + raise RuntimeError + except RuntimeError: + e,b,t = sys.exc_info() + f = t.tb_frame + while levels > 0: + f = f.f_back + levels -= 1 + ldict = f.f_globals.copy() + if f.f_globals != f.f_locals: + ldict.update(f.f_locals) + + return ldict # ----------------------------------------------------------------------------- # _funcs_to_names() @@ -389,11 +446,11 @@ def _validate_file(filename): # suitable for output to a table file # ----------------------------------------------------------------------------- -def _funcs_to_names(funclist): +def _funcs_to_names(funclist,namelist): result = [] - for f in funclist: + for f,name in zip(funclist,namelist): if f and f[0]: - result.append((f[0].__name__,f[1])) + result.append((name, f[1])) else: result.append(f) return result @@ -430,25 +487,27 @@ def _form_master_re(relist,reflags,ldict,toknames): # Build the index to function map for the matching engine lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) + lexindexnames = lexindexfunc[:] + for f,i in lexre.groupindex.items(): handle = ldict.get(f,None) if type(handle) in (types.FunctionType, types.MethodType): - lexindexfunc[i] = (handle,toknames[handle.__name__]) + lexindexfunc[i] = (handle,toknames[f]) + lexindexnames[i] = f elif handle is not None: - # If rule was specified as a string, we build an anonymous - # callback function to carry out the action + lexindexnames[i] = f if f.find("ignore_") > 0: lexindexfunc[i] = (None,None) else: lexindexfunc[i] = (None, toknames[f]) - - return [(lexre,lexindexfunc)],[regex] - except Exception,e: + + return [(lexre,lexindexfunc)],[regex],[lexindexnames] + except Exception: m = int(len(relist)/2) if m == 0: m = 1 - llist, lre = _form_master_re(relist[:m],reflags,ldict,toknames) - rlist, rre = _form_master_re(relist[m:],reflags,ldict,toknames) - return llist+rlist, lre+rre + llist, lre, lnames = 
_form_master_re(relist[:m],reflags,ldict,toknames) + rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames) + return llist+rlist, lre+rre, lnames+rnames # ----------------------------------------------------------------------------- # def _statetoken(s,names) @@ -463,61 +522,376 @@ def _statetoken(s,names): nonstate = 1 parts = s.split("_") for i in range(1,len(parts)): - if not names.has_key(parts[i]) and parts[i] != 'ANY': break + if not parts[i] in names and parts[i] != 'ANY': break if i > 1: states = tuple(parts[1:i]) else: states = ('INITIAL',) if 'ANY' in states: - states = tuple(names.keys()) + states = tuple(names) tokenname = "_".join(parts[i:]) return (states,tokenname) + +# ----------------------------------------------------------------------------- +# LexerReflect() +# +# This class represents information needed to build a lexer as extracted from a +# user's input file. +# ----------------------------------------------------------------------------- +class LexerReflect(object): + def __init__(self,ldict,log=None,reflags=0): + self.ldict = ldict + self.error_func = None + self.tokens = [] + self.reflags = reflags + self.stateinfo = { 'INITIAL' : 'inclusive'} + self.files = {} + self.error = 0 + + if log is None: + self.log = PlyLogger(sys.stderr) + else: + self.log = log + + # Get all of the basic information + def get_all(self): + self.get_tokens() + self.get_literals() + self.get_states() + self.get_rules() + + # Validate all of the information + def validate_all(self): + self.validate_tokens() + self.validate_literals() + self.validate_rules() + return self.error + + # Get the tokens map + def get_tokens(self): + tokens = self.ldict.get("tokens",None) + if not tokens: + self.log.error("No token list is defined") + self.error = 1 + return + + if not isinstance(tokens,(list, tuple)): + self.log.error("tokens must be a list or tuple") + self.error = 1 + return + + if not tokens: + self.log.error("tokens is empty") + self.error = 1 + 
return + + self.tokens = tokens + + # Validate the tokens + def validate_tokens(self): + terminals = {} + for n in self.tokens: + if not _is_identifier.match(n): + self.log.error("Bad token name '%s'",n) + self.error = 1 + if n in terminals: + self.log.warning("Token '%s' multiply defined", n) + terminals[n] = 1 + + # Get the literals specifier + def get_literals(self): + self.literals = self.ldict.get("literals","") + + # Validate literals + def validate_literals(self): + try: + for c in self.literals: + if not isinstance(c,StringTypes) or len(c) > 1: + self.log.error("Invalid literal %s. Must be a single character", repr(c)) + self.error = 1 + continue + + except TypeError: + self.log.error("Invalid literals specification. literals must be a sequence of characters") + self.error = 1 + + def get_states(self): + self.states = self.ldict.get("states",None) + # Build statemap + if self.states: + if not isinstance(self.states,(tuple,list)): + self.log.error("states must be defined as a tuple or list") + self.error = 1 + else: + for s in self.states: + if not isinstance(s,tuple) or len(s) != 2: + self.log.error("Invalid state specifier %s. 
Must be a tuple (statename,'exclusive|inclusive')",repr(s)) + self.error = 1 + continue + name, statetype = s + if not isinstance(name,StringTypes): + self.log.error("State name %s must be a string", repr(name)) + self.error = 1 + continue + if not (statetype == 'inclusive' or statetype == 'exclusive'): + self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name) + self.error = 1 + continue + if name in self.stateinfo: + self.log.error("State '%s' already defined",name) + self.error = 1 + continue + self.stateinfo[name] = statetype + + # Get all of the symbols with a t_ prefix and sort them into various + # categories (functions, strings, error functions, and ignore characters) + + def get_rules(self): + tsymbols = [f for f in self.ldict if f[:2] == 't_' ] + + # Now build up a list of functions and a list of strings + + self.toknames = { } # Mapping of symbols to token names + self.funcsym = { } # Symbols defined as functions + self.strsym = { } # Symbols defined as strings + self.ignore = { } # Ignore strings by state + self.errorf = { } # Error functions by state + + for s in self.stateinfo: + self.funcsym[s] = [] + self.strsym[s] = [] + + if len(tsymbols) == 0: + self.log.error("No rules of the form t_rulename are defined") + self.error = 1 + return + + for f in tsymbols: + t = self.ldict[f] + states, tokname = _statetoken(f,self.stateinfo) + self.toknames[f] = tokname + + if hasattr(t,"__call__"): + if tokname == 'error': + for s in states: + self.errorf[s] = t + elif tokname == 'ignore': + line = func_code(t).co_firstlineno + file = func_code(t).co_filename + self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__) + self.error = 1 + else: + for s in states: + self.funcsym[s].append((f,t)) + elif isinstance(t, StringTypes): + if tokname == 'ignore': + for s in states: + self.ignore[s] = t + if "\\" in t: + self.log.warning("%s contains a literal backslash '\\'",f) + + elif tokname == 'error': + 
self.log.error("Rule '%s' must be defined as a function", f) + self.error = 1 + else: + for s in states: + self.strsym[s].append((f,t)) + else: + self.log.error("%s not defined as a function or string", f) + self.error = 1 + + # Sort the functions by line number + for f in self.funcsym.values(): + if sys.version_info[0] < 3: + f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code(y[1]).co_firstlineno)) + else: + # Python 3.0 + f.sort(key=lambda x: func_code(x[1]).co_firstlineno) + + # Sort the strings by regular expression length + for s in self.strsym.values(): + if sys.version_info[0] < 3: + s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) + else: + # Python 3.0 + s.sort(key=lambda x: len(x[1]),reverse=True) + + # Validate all of the t_rules collected + def validate_rules(self): + for state in self.stateinfo: + # Validate all rules defined by functions + + + + for fname, f in self.funcsym[state]: + line = func_code(f).co_firstlineno + file = func_code(f).co_filename + self.files[file] = 1 + + tokname = self.toknames[fname] + if isinstance(f, types.MethodType): + reqargs = 2 + else: + reqargs = 1 + nargs = func_code(f).co_argcount + if nargs > reqargs: + self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__) + self.error = 1 + continue + + if nargs < reqargs: + self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__) + self.error = 1 + continue + + if not f.__doc__: + self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__) + self.error = 1 + continue + + try: + c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags) + if c.match(""): + self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__) + self.error = 1 + except re.error: + _etype, e, _etrace = sys.exc_info() + self.log.error("%s:%d: Invalid regular expression for rule '%s'. 
%s", file,line,f.__name__,e) + if '#' in f.__doc__: + self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__) + self.error = 1 + + # Validate all rules defined by strings + for name,r in self.strsym[state]: + tokname = self.toknames[name] + if tokname == 'error': + self.log.error("Rule '%s' must be defined as a function", name) + self.error = 1 + continue + + if not tokname in self.tokens and tokname.find("ignore_") < 0: + self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname) + self.error = 1 + continue + + try: + c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags) + if (c.match("")): + self.log.error("Regular expression for rule '%s' matches empty string",name) + self.error = 1 + except re.error: + _etype, e, _etrace = sys.exc_info() + self.log.error("Invalid regular expression for rule '%s'. %s",name,e) + if '#' in r: + self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name) + self.error = 1 + + if not self.funcsym[state] and not self.strsym[state]: + self.log.error("No rules defined for state '%s'",state) + self.error = 1 + + # Validate the error function + efunc = self.errorf.get(state,None) + if efunc: + f = efunc + line = func_code(f).co_firstlineno + file = func_code(f).co_filename + self.files[file] = 1 + + if isinstance(f, types.MethodType): + reqargs = 2 + else: + reqargs = 1 + nargs = func_code(f).co_argcount + if nargs > reqargs: + self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__) + self.error = 1 + + if nargs < reqargs: + self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__) + self.error = 1 + + for f in self.files: + self.validate_file(f) + + + # ----------------------------------------------------------------------------- + # validate_file() + # + # This checks to see if there are duplicated t_rulename() functions or strings + # in the parser input file. 
This is done using a simple regular expression + # match on each line in the given file. + # ----------------------------------------------------------------------------- + + def validate_file(self,filename): + import os.path + base,ext = os.path.splitext(filename) + if ext != '.py': return # No idea what the file is. Return OK + + try: + f = open(filename) + lines = f.readlines() + f.close() + except IOError: + return # Couldn't find the file. Don't worry about it + + fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') + sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') + + counthash = { } + linen = 1 + for l in lines: + m = fre.match(l) + if not m: + m = sre.match(l) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev) + self.error = 1 + linen += 1 + # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0): +def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None): global lexer ldict = None stateinfo = { 'INITIAL' : 'inclusive'} - error = 0 - files = { } lexobj = Lexer() - lexobj.lexdebug = debug lexobj.lexoptimize = optimize global token,input - if nowarn: warn = 0 - else: warn = 1 + if errorlog is None: + errorlog = PlyLogger(sys.stderr) + + if debug: + if debuglog is None: + debuglog = PlyLogger(sys.stderr) + # Get the module dictionary used for the lexer if object: module = object if module: - # User supplied a module object. 
- if isinstance(module, types.ModuleType): - ldict = module.__dict__ - elif isinstance(module, _INSTANCETYPE): - _items = [(k,getattr(module,k)) for k in dir(module)] - ldict = { } - for (i,v) in _items: - ldict[i] = v - else: - raise ValueError,"Expected a module or instance" - lexobj.lexmodule = module - + _items = [(k,getattr(module,k)) for k in dir(module)] + ldict = dict(_items) else: - # No module given. We might be able to get information from the caller. - try: - raise RuntimeError - except RuntimeError: - e,b,t = sys.exc_info() - f = t.tb_frame - f = f.f_back # Walk out to our calling function - ldict = f.f_globals # Grab its globals dictionary + ldict = get_caller_module_dict(2) + + # Collect parser information from the dictionary + linfo = LexerReflect(ldict,log=errorlog,reflags=reflags) + linfo.get_all() + if not optimize: + if linfo.validate_all(): + raise SyntaxError("Can't build lexer") if optimize and lextab: try: @@ -530,280 +904,94 @@ def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,now except ImportError: pass - # Get the tokens, states, and literals variables (if any) - if (module and isinstance(module,_INSTANCETYPE)): - tokens = getattr(module,"tokens",None) - states = getattr(module,"states",None) - literals = getattr(module,"literals","") - else: - tokens = ldict.get("tokens",None) - states = ldict.get("states",None) - literals = ldict.get("literals","") - - if not tokens: - raise SyntaxError,"lex: module does not define 'tokens'" - if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): - raise SyntaxError,"lex: tokens must be a list or tuple." 
+ # Dump some basic debugging information + if debug: + debuglog.info("lex: tokens = %r", linfo.tokens) + debuglog.info("lex: literals = %r", linfo.literals) + debuglog.info("lex: states = %r", linfo.stateinfo) # Build a dictionary of valid token names lexobj.lextokens = { } - if not optimize: - for n in tokens: - if not _is_identifier.match(n): - print >>sys.stderr, "lex: Bad token name '%s'" % n - error = 1 - if warn and lexobj.lextokens.has_key(n): - print >>sys.stderr, "lex: Warning. Token '%s' multiply defined." % n - lexobj.lextokens[n] = None - else: - for n in tokens: lexobj.lextokens[n] = None - - if debug: - print "lex: tokens = '%s'" % lexobj.lextokens.keys() + for n in linfo.tokens: + lexobj.lextokens[n] = 1 - try: - for c in literals: - if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1: - print >>sys.stderr, "lex: Invalid literal %s. Must be a single character" % repr(c) - error = 1 - continue - - except TypeError: - print >>sys.stderr, "lex: Invalid literals specification. literals must be a sequence of characters." - error = 1 - - lexobj.lexliterals = literals - - # Build statemap - if states: - if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)): - print >>sys.stderr, "lex: states must be defined as a tuple or list." - error = 1 - else: - for s in states: - if not isinstance(s,types.TupleType) or len(s) != 2: - print >>sys.stderr, "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s) - error = 1 - continue - name, statetype = s - if not isinstance(name,types.StringType): - print >>sys.stderr, "lex: state name %s must be a string" % repr(name) - error = 1 - continue - if not (statetype == 'inclusive' or statetype == 'exclusive'): - print >>sys.stderr, "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name - error = 1 - continue - if stateinfo.has_key(name): - print >>sys.stderr, "lex: state '%s' already defined." 
% name - error = 1 - continue - stateinfo[name] = statetype - - # Get a list of symbols with the t_ or s_ prefix - tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ] - - # Now build up a list of functions and a list of strings - - funcsym = { } # Symbols defined as functions - strsym = { } # Symbols defined as strings - toknames = { } # Mapping of symbols to token names - - for s in stateinfo.keys(): - funcsym[s] = [] - strsym[s] = [] - - ignore = { } # Ignore strings by state - errorf = { } # Error functions by state - - if len(tsymbols) == 0: - raise SyntaxError,"lex: no rules of the form t_rulename are defined." - - for f in tsymbols: - t = ldict[f] - states, tokname = _statetoken(f,stateinfo) - toknames[f] = tokname - - if callable(t): - for s in states: funcsym[s].append((f,t)) - elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)): - for s in states: strsym[s].append((f,t)) - else: - print >>sys.stderr, "lex: %s not defined as a function or string" % f - error = 1 - - # Sort the functions by line number - for f in funcsym.values(): - f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno)) + # Get literals specification + if isinstance(linfo.literals,(list,tuple)): + lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) + else: + lexobj.lexliterals = linfo.literals - # Sort the strings by regular expression length - for s in strsym.values(): - s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) + # Get the stateinfo dictionary + stateinfo = linfo.stateinfo regexs = { } - # Build the master regular expressions - for state in stateinfo.keys(): + for state in stateinfo: regex_list = [] # Add rules defined by functions first - for fname, f in funcsym[state]: - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - files[file] = None - tokname = toknames[fname] - - ismethod = isinstance(f, types.MethodType) - - if not optimize: - nargs = f.func_code.co_argcount - if 
ismethod: - reqargs = 2 - else: - reqargs = 1 - if nargs > reqargs: - print >>sys.stderr, "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) - error = 1 - continue - - if nargs < reqargs: - print >>sys.stderr, "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) - error = 1 - continue - - if tokname == 'ignore': - print >>sys.stderr, "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) - error = 1 - continue - - if tokname == 'error': - errorf[state] = f - continue - - if f.__doc__: - if not optimize: - try: - c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags) - if c.match(""): - print >>sys.stderr, "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__) - error = 1 - continue - except re.error,e: - print >>sys.stderr, "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) - if '#' in f.__doc__: - print >>sys.stderr, "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__) - error = 1 - continue - - if debug: - print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state) - - # Okay. The regular expression seemed okay. Let's append it to the master regular - # expression we're building - - regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__)) - else: - print >>sys.stderr, "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + for fname, f in linfo.funcsym[state]: + line = func_code(f).co_firstlineno + file = func_code(f).co_filename + regex_list.append("(?P<%s>%s)" % (fname,f.__doc__)) + if debug: + debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state) # Now add all of the simple rules - for name,r in strsym[state]: - tokname = toknames[name] - - if tokname == 'ignore': - if "\\" in r: - print >>sys.stderr, "lex: Warning. 
%s contains a literal backslash '\\'" % name - ignore[state] = r - continue - - if not optimize: - if tokname == 'error': - raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name - error = 1 - continue - - if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0: - print >>sys.stderr, "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname) - error = 1 - continue - try: - c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) - if (c.match("")): - print >>sys.stderr, "lex: Regular expression for rule '%s' matches empty string." % name - error = 1 - continue - except re.error,e: - print >>sys.stderr, "lex: Invalid regular expression for rule '%s'. %s" % (name,e) - if '#' in r: - print >>sys.stderr, "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name - - error = 1 - continue - if debug: - print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state) - + for name,r in linfo.strsym[state]: regex_list.append("(?P<%s>%s)" % (name,r)) - - if not regex_list: - print >>sys.stderr, "lex: No rules defined for state '%s'" % state - error = 1 + if debug: + debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state) regexs[state] = regex_list - - if not optimize: - for f in files.keys(): - if not _validate_file(f): - error = 1 - - if error: - raise SyntaxError,"lex: Unable to build lexer." - - # From this point forward, we're reasonably confident that we can build the lexer. - # No more errors will be generated, but there might be some warning messages. 
- # Build the master regular expressions - for state in regexs.keys(): - lexre, re_text = _form_master_re(regexs[state],reflags,ldict,toknames) + if debug: + debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====") + + for state in regexs: + lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames) lexobj.lexstatere[state] = lexre lexobj.lexstateretext[state] = re_text + lexobj.lexstaterenames[state] = re_names if debug: for i in range(len(re_text)): - print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i]) + debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i]) - # For inclusive states, we need to add the INITIAL state - for state,type in stateinfo.items(): - if state != "INITIAL" and type == 'inclusive': + # For inclusive states, we need to add the regular expressions from the INITIAL state + for state,stype in stateinfo.items(): + if state != "INITIAL" and stype == 'inclusive': lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) + lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL']) lexobj.lexstateinfo = stateinfo lexobj.lexre = lexobj.lexstatere["INITIAL"] lexobj.lexretext = lexobj.lexstateretext["INITIAL"] # Set up ignore variables - lexobj.lexstateignore = ignore + lexobj.lexstateignore = linfo.ignore lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","") # Set up error functions - lexobj.lexstateerrorf = errorf - lexobj.lexerrorf = errorf.get("INITIAL",None) - if warn and not lexobj.lexerrorf: - print >>sys.stderr, "lex: Warning. no t_error rule is defined." 
+ lexobj.lexstateerrorf = linfo.errorf + lexobj.lexerrorf = linfo.errorf.get("INITIAL",None) + if not lexobj.lexerrorf: + errorlog.warning("No t_error rule is defined") # Check state information for ignore and error rules for s,stype in stateinfo.items(): if stype == 'exclusive': - if warn and not errorf.has_key(s): - print >>sys.stderr, "lex: Warning. no error rule is defined for exclusive state '%s'" % s - if warn and not ignore.has_key(s) and lexobj.lexignore: - print >>sys.stderr, "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s + if not s in linfo.errorf: + errorlog.warning("No error rule is defined for exclusive state '%s'", s) + if not s in linfo.ignore and lexobj.lexignore: + errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) elif stype == 'inclusive': - if not errorf.has_key(s): - errorf[s] = errorf.get("INITIAL",None) - if not ignore.has_key(s): - ignore[s] = ignore.get("INITIAL","") - + if not s in linfo.errorf: + linfo.errorf[s] = linfo.errorf.get("INITIAL",None) + if not s in linfo.ignore: + linfo.ignore[s] = linfo.ignore.get("INITIAL","") # Create global versions of the token() and input() functions token = lexobj.token @@ -812,7 +1000,7 @@ def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,now # If in optimize mode, we write the lextab if lextab and optimize: - lexobj.writetab(lextab) + lexobj.writetab(lextab,outputdir) return lexobj @@ -830,7 +1018,7 @@ def runmain(lexer=None,data=None): data = f.read() f.close() except IndexError: - print "Reading from standard input (type EOF to end):" + sys.stdout.write("Reading from standard input (type EOF to end):\n") data = sys.stdin.read() if lexer: @@ -846,8 +1034,7 @@ def runmain(lexer=None,data=None): while 1: tok = _token() if not tok: break - print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos) - + sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos)) # 
----------------------------------------------------------------------------- # @TOKEN(regex) @@ -858,7 +1045,10 @@ def runmain(lexer=None,data=None): def TOKEN(r): def set_doc(f): - f.__doc__ = r + if hasattr(r,"__call__"): + f.__doc__ = r.__doc__ + else: + f.__doc__ = r return f return set_doc diff --git a/ext/ply/ply/yacc.py b/ext/ply/ply/yacc.py index 39c17a9ed..3bf6e8e4d 100644 --- a/ext/ply/ply/yacc.py +++ b/ext/ply/ply/yacc.py @@ -1,26 +1,35 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # ply: yacc.py # -# Author(s): David M. Beazley (dave@dabeaz.com) -# -# Copyright (C) 2001-2007, David M. Beazley -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See the file COPYING for a complete copy of the LGPL. -# +# Copyright (C) 2001-2009, +# David M. Beazley (Dabeaz LLC) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * Neither the name of the David Beazley or Dabeaz LLC may be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- # # This implements an LR parser that is constructed from grammar rules defined # as Python functions. The grammer is specified by supplying the BNF inside @@ -50,7 +59,8 @@ # own risk! # ---------------------------------------------------------------------------- -__version__ = "2.3" +__version__ = "3.2" +__tabversion__ = "3.2" # Table version #----------------------------------------------------------------------------- # === User configurable parameters === @@ -67,20 +77,86 @@ default_lr = 'LALR' # Default LR table generation method error_count = 3 # Number of symbols that must be shifted to leave recovery mode -import re, types, sys, cStringIO, md5, os.path +yaccdevel = 0 # Set to True if developing yacc. 
This turns off optimized + # implementations of certain functions. -# Exception raised for yacc-related errors -class YaccError(Exception): pass +resultlimit = 40 # Size limit of results when running in debug mode. + +pickle_protocol = 0 # Protocol to use when writing pickle files -# Available instance types. This is used when parsers are defined by a class. -# it's a little funky because I want to preserve backwards compatibility -# with Python 2.0 where types.ObjectType is undefined. +import re, types, sys, os.path +# Compatibility function for python 2.6/3.0 +if sys.version_info[0] < 3: + def func_code(f): + return f.func_code +else: + def func_code(f): + return f.__code__ + +# Compatibility try: - _INSTANCETYPE = (types.InstanceType, types.ObjectType) + MAXINT = sys.maxint except AttributeError: - _INSTANCETYPE = types.InstanceType - class object: pass # Note: needed if no new-style classes present + MAXINT = sys.maxsize + +# Python 2.x/3.0 compatibility. +def load_ply_lex(): + if sys.version_info[0] < 3: + import lex + else: + import ply.lex as lex + return lex + +# This object is a stand-in for a logging object created by the +# logging module. PLY will use this by default to create things +# such as the parser.out file. If a user wants more detailed +# information, they can create their own logging object and pass +# it into PLY. + +class PlyLogger(object): + def __init__(self,f): + self.f = f + def debug(self,msg,*args,**kwargs): + self.f.write((msg % args) + "\n") + info = debug + + def warning(self,msg,*args,**kwargs): + self.f.write("WARNING: "+ (msg % args) + "\n") + + def error(self,msg,*args,**kwargs): + self.f.write("ERROR: " + (msg % args) + "\n") + + critical = debug + +# Null logger is used when no output is generated. Does nothing. 
+class NullLogger(object): + def __getattribute__(self,name): + return self + def __call__(self,*args,**kwargs): + return self + +# Exception raised for yacc-related errors +class YaccError(Exception): pass + +# Format the result message that the parser produces when running in debug mode. +def format_result(r): + repr_str = repr(r) + if '\n' in repr_str: repr_str = repr(repr_str) + if len(repr_str) > resultlimit: + repr_str = repr_str[:resultlimit]+" ..." + result = "<%s @ 0x%x> (%s)" % (type(r).__name__,id(r),repr_str) + return result + + +# Format stack entries when the parser is running in debug mode +def format_stack_entry(r): + repr_str = repr(r) + if '\n' in repr_str: repr_str = repr(repr_str) + if len(repr_str) < 16: + return repr_str + else: + return "<%s @ 0x%x>" % (type(r).__name__,id(r)) #----------------------------------------------------------------------------- # === LR Parsing Engine === @@ -99,7 +175,7 @@ except AttributeError: # .lexpos = Starting lex position # .endlexpos = Ending lex position (optional, set automatically) -class YaccSymbol(object): +class YaccSymbol: def __str__(self): return self.type def __repr__(self): return str(self) @@ -115,8 +191,9 @@ class YaccSymbol(object): class YaccProduction: def __init__(self,s,stack=None): self.slice = s - self.pbstack = [] self.stack = stack + self.lexer = None + self.parser= None def __getitem__(self,n): if n >= 0: return self.slice[n].value else: return self.stack[n].value @@ -133,6 +210,9 @@ class YaccProduction: def lineno(self,n): return getattr(self.slice[n],"lineno",0) + def set_lineno(self,n,lineno): + self.slice[n].lineno = n + def linespan(self,n): startline = getattr(self.slice[n],"lineno",0) endline = getattr(self.slice[n],"endlineno",startline) @@ -146,35 +226,22 @@ class YaccProduction: endpos = getattr(self.slice[n],"endlexpos",startpos) return startpos,endpos - def pushback(self,n): - if n <= 0: - raise ValueError, "Expected a positive value" - if n > (len(self.slice)-1): - raise 
ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) - for i in range(0,n): - self.pbstack.append(self.slice[-i-1]) + def error(self): + raise SyntaxError -# The LR Parsing engine. This is defined as a class so that multiple parsers -# can exist in the same process. A user never instantiates this directly. -# Instead, the global yacc() function should be used to create a suitable Parser -# object. -class Parser: - def __init__(self,magic=None): - - # This is a hack to keep users from trying to instantiate a Parser - # object directly. - - if magic != "xyzzy": - raise YaccError, "Can't instantiate Parser. Use yacc() instead." +# ----------------------------------------------------------------------------- +# == LRParser == +# +# The LR Parsing engine. +# ----------------------------------------------------------------------------- - # Reset internal state - self.productions = None # List of productions - self.errorfunc = None # Error handling function - self.action = { } # LR Action table - self.goto = { } # LR goto table - self.require = { } # Attribute require table - self.method = "Unknown LR" # Table construction method used +class LRParser: + def __init__(self,lrtab,errorf): + self.productions = lrtab.lr_productions + self.action = lrtab.lr_action + self.goto = lrtab.lr_goto + self.errorfunc = errorf def errok(self): self.errorok = 1 @@ -187,29 +254,64 @@ class Parser: self.symstack.append(sym) self.statestack.append(0) - def parse(self,input=None,lexer=None,debug=0,tracking=0): + def parse(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + if debug or yaccdevel: + if isinstance(debug,int): + debug = PlyLogger(sys.stderr) + return self.parsedebug(input,lexer,debug,tracking,tokenfunc) + elif tracking: + return self.parseopt(input,lexer,debug,tracking,tokenfunc) + else: + return self.parseopt_notrack(input,lexer,debug,tracking,tokenfunc) + + + # 
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parsedebug(). + # + # This is the debugging enabled version of parse(). All changes made to the + # parsing engine should be made here. For the non-debugging version, + # copy this code to a method parseopt() and delete all of the sections + # enclosed in: + # + # #--! DEBUG + # statements + # #--! DEBUG + # + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + def parsedebug(self,input=None,lexer=None,debug=None,tracking=0,tokenfunc=None): lookahead = None # Current lookahead symbol lookaheadstack = [ ] # Stack of lookahead symbols - actions = self.action # Local reference to action table - goto = self.goto # Local reference to goto table - prod = self.productions # Local reference to production list + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) pslice = YaccProduction(None) # Production object passed to grammar rules - errorcount = 0 # Used during error recovery + errorcount = 0 # Used during error recovery + + # --! DEBUG + debug.info("PLY: PARSE DEBUG START") + # --! 
DEBUG # If no lexer was given, we will try to use the lex module if not lexer: - import lex + lex = load_ply_lex() lexer = lex.lexer + # Set up the lexer and parser objects on pslice pslice.lexer = lexer pslice.parser = self # If input was supplied, pass to lexer - if input: + if input is not None: lexer.input(input) - # Tokenize function - get_token = lexer.token + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set up the state and symbol stacks statestack = [ ] # Stack of parsing states self.statestack = statestack @@ -223,15 +325,19 @@ class Parser: statestack.append(0) sym = YaccSymbol() - sym.type = '$end' + sym.type = "$end" symstack.append(sym) state = 0 while 1: # Get the next symbol on the input. If a lookahead symbol # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer - if debug > 1: - print 'state', state + + # --! DEBUG + debug.debug('') + debug.debug('State : %s', state) + # --! DEBUG + if not lookahead: if not lookaheadstack: lookahead = get_token() # Get the next token @@ -239,27 +345,27 @@ class Parser: lookahead = lookaheadstack.pop() if not lookahead: lookahead = YaccSymbol() - lookahead.type = '$end' - if debug: - errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() + lookahead.type = "$end" + + # --! DEBUG + debug.debug('Stack : %s', + ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) + # --! DEBUG # Check the action table ltype = lookahead.type t = actions[state].get(ltype) - if debug > 1: - print 'action', t if t is not None: if t > 0: # shift a symbol on the stack - if ltype == '$end': - # Error, end of input - sys.stderr.write("yacc: Parse error. EOF\n") - return statestack.append(t) state = t - if debug > 1: - sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) + + # --! 
DEBUG + debug.debug("Action : Shift and goto state %s", t) + # --! DEBUG + symstack.append(lookahead) lookahead = None @@ -277,12 +383,20 @@ class Parser: sym = YaccSymbol() sym.type = pname # Production name sym.value = None - if debug > 1: - sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) + + # --! DEBUG + if plen: + debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, "["+",".join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+"]",-t) + else: + debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, [],-t) + + # --! DEBUG if plen: targ = symstack[-plen-1:] targ[0] = sym + + # --! TRACKING if tracking: t1 = targ[1] sym.lineno = t1.lineno @@ -290,38 +404,368 @@ class Parser: t1 = targ[-1] sym.endlineno = getattr(t1,"endlineno",t1.lineno) sym.endlexpos = getattr(t1,"endlexpos",t1.lexpos) - del symstack[-plen:] - del statestack[-plen:] + + # --! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + del statestack[-plen:] + p.callable(pslice) + # --! DEBUG + debug.info("Result : %s", format_result(pslice[0])) + # --! DEBUG + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + else: + + # --! TRACKING if tracking: sym.lineno = lexer.lineno sym.lexpos = lexer.lexpos + # --! 
TRACKING + targ = [ sym ] - pslice.slice = targ - # Call the grammar rule with our special slice object - p.func(pslice) + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.callable(pslice) + # --! DEBUG + debug.info("Result : %s", format_result(pslice[0])) + # --! DEBUG + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + result = getattr(n,"value",None) + # --! DEBUG + debug.info("Done : Returning %s", format_result(result)) + debug.info("PLY: PARSE DEBUG END") + # --! DEBUG + return result + + if t == None: + + # --! DEBUG + debug.error('Error : %s', + ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) + # --! DEBUG + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = 0 + errtoken = lookahead + if errtoken.type == "$end": + errtoken = None # End of file! 
+ if self.errorfunc: + global errok,token,restart + errok = self.errok # Set some special functions available in error recovery + token = get_token + restart = self.restart + if errtoken and not hasattr(errtoken,'lexer'): + errtoken.lexer = lexer + tok = self.errorfunc(errtoken) + del errok, token, restart # Delete special functions - # If there was a pushback, put that on the stack - if pslice.pbstack: - lookaheadstack.append(lookahead) - for _t in pslice.pbstack: - lookaheadstack.append(_t) + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken,"lineno"): lineno = lookahead.lineno + else: lineno = 0 + if lineno: + sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) + else: + sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) + else: + sys.stderr.write("yacc: Parse error in input. EOF\n") + return + + else: + errorcount = error_count + + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. + + if len(statestack) <= 1 and lookahead.type != "$end": + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue + + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. nuke the top entry and generate an error token + + # Start nuking entries on the stack + if lookahead.type == "$end": + # Whoa. We're really hosed here. Bail out + return + + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. 
Error is on top of stack, we'll just nuke input + # symbol and continue lookahead = None - pslice.pbstack = [] + continue + t = YaccSymbol() + t.type = 'error' + if hasattr(lookahead,"lineno"): + t.lineno = lookahead.lineno + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + symstack.pop() + statestack.pop() + state = statestack[-1] # Potential bug fix + + continue + + # Call an error function here + raise RuntimeError("yacc: internal parser error!!!\n") + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt(). + # + # Optimized version of parse() method. DO NOT EDIT THIS CODE DIRECTLY. + # Edit the debug version above, then copy any modifications to the method + # below while removing #--! DEBUG sections. + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + def parseopt(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + lookahead = None # Current lookahead symbol + lookaheadstack = [ ] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) 
+ pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + # If no lexer was given, we will try to use the lex module + if not lexer: + lex = load_ply_lex() + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self + + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) + + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc + + # Set up the state and symbol stacks + + statestack = [ ] # Stack of parsing states + self.statestack = statestack + symstack = [ ] # Stack of grammar symbols + self.symstack = symstack + + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while 1: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. 
Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + + if t is not None: + if t > 0: + # shift a symbol on the stack + statestack.append(t) + state = t + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: errorcount -=1 continue + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + # --! TRACKING + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1,"endlineno",t1.lineno) + sym.endlexpos = getattr(t1,"endlexpos",t1.lexpos) + + # --! TRACKING + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + del statestack[-plen:] + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ + else: + + # --! TRACKING + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos + # --! TRACKING + + targ = [ sym ] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + if t == 0: n = symstack[-1] return getattr(n,"value",None) if t == None: - if debug: - sys.stderr.write(errorlead + "\n") + # We have some kind of parsing error here. To handle # this, we are going to push the current token onto # the tokenstack and replace it with an 'error' token. 
@@ -343,6 +787,8 @@ class Parser: errok = self.errok # Set some special functions available in error recovery token = get_token restart = self.restart + if errtoken and not hasattr(errtoken,'lexer'): + errtoken.lexer = lexer tok = self.errorfunc(errtoken) del errok, token, restart # Delete special functions @@ -375,6 +821,7 @@ class Parser: if len(statestack) <= 1 and lookahead.type != '$end': lookahead = None errtoken = None + state = 0 # Nuke the pushback stack del lookaheadstack[:] continue @@ -404,1086 +851,1033 @@ class Parser: else: symstack.pop() statestack.pop() + state = statestack[-1] # Potential bug fix continue # Call an error function here - raise RuntimeError, "yacc: internal parser error!!!\n" + raise RuntimeError("yacc: internal parser error!!!\n") -# ----------------------------------------------------------------------------- -# === Parser Construction === -# -# The following functions and variables are used to implement the yacc() function -# itself. This is pretty hairy stuff involving lots of error checking, -# construction of LR items, kernels, and so forth. Although a lot of -# this work is done using global variables, the resulting Parser object -# is completely self contained--meaning that it is safe to repeatedly -# call yacc() with different grammars in the same application. -# ----------------------------------------------------------------------------- + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # parseopt_notrack(). + # + # Optimized version of parseopt() with line number tracking removed. + # DO NOT EDIT THIS CODE DIRECTLY. Copy the optimized version and remove + # code in the #--! TRACKING sections + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# ----------------------------------------------------------------------------- -# validate_file() -# -# This function checks to see if there are duplicated p_rulename() functions -# in the parser module file. 
Without this function, it is really easy for -# users to make mistakes by cutting and pasting code fragments (and it's a real -# bugger to try and figure out why the resulting parser doesn't work). Therefore, -# we just do a little regular expression pattern matching of def statements -# to try and detect duplicates. -# ----------------------------------------------------------------------------- + def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + lookahead = None # Current lookahead symbol + lookaheadstack = [ ] # Stack of lookahead symbols + actions = self.action # Local reference to action table (to avoid lookup on self.) + goto = self.goto # Local reference to goto table (to avoid lookup on self.) + prod = self.productions # Local reference to production list (to avoid lookup on self.) + pslice = YaccProduction(None) # Production object passed to grammar rules + errorcount = 0 # Used during error recovery + + # If no lexer was given, we will try to use the lex module + if not lexer: + lex = load_ply_lex() + lexer = lex.lexer + + # Set up the lexer and parser objects on pslice + pslice.lexer = lexer + pslice.parser = self -def validate_file(filename): - base,ext = os.path.splitext(filename) - if ext != '.py': return 1 # No idea. Assume it's okay. + # If input was supplied, pass to lexer + if input is not None: + lexer.input(input) - try: - f = open(filename) - lines = f.readlines() - f.close() - except IOError: - return 1 # Oh well - - # Match def p_funcname( - fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') - counthash = { } - linen = 1 - noerror = 1 - for l in lines: - m = fre.match(l) - if m: - name = m.group(1) - prev = counthash.get(name) - if not prev: - counthash[name] = linen - else: - sys.stderr.write("%s:%d: Function %s redefined. 
Previously defined on line %d\n" % (filename,linen,name,prev)) - noerror = 0 - linen += 1 - return noerror - -# This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. -def validate_dict(d): - for n,v in d.items(): - if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue - if n[0:2] == 't_': continue - - if n[0:2] == 'p_': - sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) - if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: - try: - doc = v.__doc__.split(" ") - if doc[1] == ':': - sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) - except StandardError: - pass + if tokenfunc is None: + # Tokenize function + get_token = lexer.token + else: + get_token = tokenfunc -# ----------------------------------------------------------------------------- -# === GRAMMAR FUNCTIONS === -# -# The following global variables and functions are used to store, manipulate, -# and verify the grammar rules specified by the user. -# ----------------------------------------------------------------------------- + # Set up the state and symbol stacks -# Initialize all of the global variables used during grammar construction -def initialize_vars(): - global Productions, Prodnames, Prodmap, Terminals - global Nonterminals, First, Follow, Precedence, LRitems - global Errorfunc, Signature, Requires + statestack = [ ] # Stack of parsing states + self.statestack = statestack + symstack = [ ] # Stack of grammar symbols + self.symstack = symstack - Productions = [None] # A list of all of the productions. 
The first - # entry is always reserved for the purpose of - # building an augmented grammar + pslice.stack = symstack # Put in the production + errtoken = None # Err token + + # The start state is assumed to be (0,$end) + + statestack.append(0) + sym = YaccSymbol() + sym.type = '$end' + symstack.append(sym) + state = 0 + while 1: + # Get the next symbol on the input. If a lookahead symbol + # is already set, we just use that. Otherwise, we'll pull + # the next token off of the lookaheadstack or from the lexer + + if not lookahead: + if not lookaheadstack: + lookahead = get_token() # Get the next token + else: + lookahead = lookaheadstack.pop() + if not lookahead: + lookahead = YaccSymbol() + lookahead.type = '$end' + + # Check the action table + ltype = lookahead.type + t = actions[state].get(ltype) + + if t is not None: + if t > 0: + # shift a symbol on the stack + statestack.append(t) + state = t + + symstack.append(lookahead) + lookahead = None + + # Decrease error count on successful shift + if errorcount: errorcount -=1 + continue + + if t < 0: + # reduce a symbol on the stack, emit a production + p = prod[-t] + pname = p.name + plen = p.len + + # Get production function + sym = YaccSymbol() + sym.type = pname # Production name + sym.value = None + + if plen: + targ = symstack[-plen-1:] + targ[0] = sym + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # below as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + del symstack[-plen:] + del statestack[-plen:] + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. 
Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + else: + + targ = [ sym ] + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # The code enclosed in this section is duplicated + # above as a performance optimization. Make sure + # changes get made in both locations. + + pslice.slice = targ + + try: + # Call the grammar rule with our special slice object + p.callable(pslice) + symstack.append(sym) + state = goto[statestack[-1]][pname] + statestack.append(state) + except SyntaxError: + # If an error was set. Enter error recovery state + lookaheadstack.append(lookahead) + symstack.pop() + statestack.pop() + state = statestack[-1] + sym.type = 'error' + lookahead = sym + errorcount = error_count + self.errorok = 0 + continue + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + if t == 0: + n = symstack[-1] + return getattr(n,"value",None) + + if t == None: + + # We have some kind of parsing error here. To handle + # this, we are going to push the current token onto + # the tokenstack and replace it with an 'error' token. + # If there are any synchronization rules, they may + # catch it. + # + # In addition to pushing the error token, we call call + # the user defined p_error() function if this is the + # first syntax error. This function is only called if + # errorcount == 0. + if errorcount == 0 or self.errorok: + errorcount = error_count + self.errorok = 0 + errtoken = lookahead + if errtoken.type == '$end': + errtoken = None # End of file! 
+ if self.errorfunc: + global errok,token,restart + errok = self.errok # Set some special functions available in error recovery + token = get_token + restart = self.restart + if errtoken and not hasattr(errtoken,'lexer'): + errtoken.lexer = lexer + tok = self.errorfunc(errtoken) + del errok, token, restart # Delete special functions - Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all - # productions of that nonterminal. + if self.errorok: + # User must have done some kind of panic + # mode recovery on their own. The + # returned token is the next lookahead + lookahead = tok + errtoken = None + continue + else: + if errtoken: + if hasattr(errtoken,"lineno"): lineno = lookahead.lineno + else: lineno = 0 + if lineno: + sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) + else: + sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) + else: + sys.stderr.write("yacc: Parse error in input. EOF\n") + return - Prodmap = { } # A dictionary that is only used to detect duplicate - # productions. + else: + errorcount = error_count - Terminals = { } # A dictionary mapping the names of terminal symbols to a - # list of the rules where they are used. + # case 1: the statestack only has 1 entry on it. If we're in this state, the + # entire parse has been rolled back and we're completely hosed. The token is + # discarded and we just keep going. - Nonterminals = { } # A dictionary mapping names of nonterminals to a list - # of rule numbers where they are used. + if len(statestack) <= 1 and lookahead.type != '$end': + lookahead = None + errtoken = None + state = 0 + # Nuke the pushback stack + del lookaheadstack[:] + continue - First = { } # A dictionary of precomputed FIRST(x) symbols + # case 2: the statestack has a couple of entries on it, but we're + # at the end of the file. 
nuke the top entry and generate an error token - Follow = { } # A dictionary of precomputed FOLLOW(x) symbols + # Start nuking entries on the stack + if lookahead.type == '$end': + # Whoa. We're really hosed here. Bail out + return - Precedence = { } # Precedence rules for each terminal. Contains tuples of the - # form ('right',level) or ('nonassoc', level) or ('left',level) + if lookahead.type != 'error': + sym = symstack[-1] + if sym.type == 'error': + # Hmmm. Error is on top of stack, we'll just nuke input + # symbol and continue + lookahead = None + continue + t = YaccSymbol() + t.type = 'error' + if hasattr(lookahead,"lineno"): + t.lineno = lookahead.lineno + t.value = lookahead + lookaheadstack.append(lookahead) + lookahead = t + else: + symstack.pop() + statestack.pop() + state = statestack[-1] # Potential bug fix - LRitems = [ ] # A list of all LR items for the grammar. These are the - # productions with the "dot" like E -> E . PLUS E + continue - Errorfunc = None # User defined error handler + # Call an error function here + raise RuntimeError("yacc: internal parser error!!!\n") - Signature = md5.new() # Digital signature of the grammar rules, precedence - # and other information. Used to determined when a - # parsing table needs to be regenerated. +# ----------------------------------------------------------------------------- +# === Grammar Representation === +# +# The following functions, classes, and variables are used to represent and +# manipulate the rules that make up a grammar. 
+# ----------------------------------------------------------------------------- - Requires = { } # Requires list +import re - # File objects used when creating the parser.out debugging file - global _vf, _vfc - _vf = cStringIO.StringIO() - _vfc = cStringIO.StringIO() +# regex matching identifiers +_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') # ----------------------------------------------------------------------------- # class Production: # # This class stores the raw information about a single production or grammar rule. -# It has a few required attributes: +# A grammar rule refers to a specification such as this: # -# name - Name of the production (nonterminal) -# prod - A list of symbols making up its production +# expr : expr PLUS term +# +# Here are the basic attributes defined on all productions +# +# name - Name of the production. For example 'expr' +# prod - A list of symbols on the right side ['expr','PLUS','term'] +# prec - Production precedence level # number - Production number. +# func - Function that executes on reduce +# file - File where production function is defined +# lineno - Line number where production function is defined # -# In addition, a few additional attributes are used to help with debugging or -# optimization of table generation. +# The following attributes are defined or optional. # -# file - File where production action is defined. -# lineno - Line number where action is defined -# func - Action function -# prec - Precedence level -# lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' -# then lr_next refers to 'E -> E PLUS . E' -# lr_index - LR item index (location of the ".") in the prod list. 
-# lookaheads - LALR lookahead symbols for this item -# len - Length of the production (number of symbols on right hand side) +# len - Length of the production (number of symbols on right hand side) +# usyms - Set of unique symbols found in the production # ----------------------------------------------------------------------------- -class Production: - def __init__(self,**kw): - for k,v in kw.items(): - setattr(self,k,v) - self.lr_index = -1 - self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure - self.lr1_added = 0 # Flag indicating whether or not added to LR1 - self.usyms = [ ] - self.lookaheads = { } - self.lk_added = { } - self.setnumbers = [ ] - - def __str__(self): +class Production(object): + reduced = 0 + def __init__(self,number,name,prod,precedence=('right',0),func=None,file='',line=0): + self.name = name + self.prod = tuple(prod) + self.number = number + self.func = func + self.callable = None + self.file = file + self.line = line + self.prec = precedence + + # Internal settings used during table construction + + self.len = len(self.prod) # Length of the production + + # Create a list of unique production symbols used in the production + self.usyms = [ ] + for s in self.prod: + if s not in self.usyms: + self.usyms.append(s) + + # List of all LR items for the production + self.lr_items = [] + self.lr_next = None + + # Create a string representation if self.prod: - s = "%s -> %s" % (self.name," ".join(self.prod)) + self.str = "%s -> %s" % (self.name," ".join(self.prod)) else: - s = "%s -> <empty>" % self.name - return s + self.str = "%s -> <empty>" % self.name + + def __str__(self): + return self.str def __repr__(self): - return str(self) + return "Production("+str(self)+")" + + def __len__(self): + return len(self.prod) + + def __nonzero__(self): + return 1 - # Compute lr_items from the production + def __getitem__(self,index): + return self.prod[index] + + # Return the nth lr_item from the production (or None if at the end) def 
lr_item(self,n): if n > len(self.prod): return None - p = Production() - p.name = self.name - p.prod = list(self.prod) - p.number = self.number - p.lr_index = n - p.lookaheads = { } - p.setnumbers = self.setnumbers - p.prod.insert(n,".") - p.prod = tuple(p.prod) - p.len = len(p.prod) - p.usyms = self.usyms - - # Precompute list of productions immediately following + p = LRItem(self,n) + + # Precompute the list of productions immediately following. Hack. Remove later try: - p.lrafter = Prodnames[p.prod[n+1]] - except (IndexError,KeyError),e: - p.lrafter = [] + p.lr_after = Prodnames[p.prod[n+1]] + except (IndexError,KeyError): + p.lr_after = [] try: - p.lrbefore = p.prod[n-1] + p.lr_before = p.prod[n-1] except IndexError: - p.lrbefore = None + p.lr_before = None return p + + # Bind the production function name to a callable + def bind(self,pdict): + if self.func: + self.callable = pdict[self.func] + +# This class serves as a minimal standin for Production objects when +# reading table data from files. It only contains information +# actually used by the LR parsing engine, plus some additional +# debugging information. +class MiniProduction(object): + def __init__(self,str,name,len,func,file,line): + self.name = name + self.len = len + self.func = func + self.callable = None + self.file = file + self.line = line + self.str = str + def __str__(self): + return self.str + def __repr__(self): + return "MiniProduction(%s)" % self.str -class MiniProduction: - pass + # Bind the production function name to a callable + def bind(self,pdict): + if self.func: + self.callable = pdict[self.func] -# regex matching identifiers -_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') # ----------------------------------------------------------------------------- -# add_production() +# class LRItem # -# Given an action function, this function assembles a production rule. -# The production rule is assumed to be found in the function's docstring. 
-# This rule has the general syntax: +# This class represents a specific stage of parsing a production rule. For +# example: # -# name1 ::= production1 -# | production2 -# | production3 -# ... -# | productionn -# name2 ::= production1 -# | production2 -# ... +# expr : expr . PLUS term +# +# In the above, the "." represents the current location of the parse. Here +# basic attributes: +# +# name - Name of the production. For example 'expr' +# prod - A list of symbols on the right side ['expr','.', 'PLUS','term'] +# number - Production number. +# +# lr_next Next LR item. Example, if we are ' expr -> expr . PLUS term' +# then lr_next refers to 'expr -> expr PLUS . term' +# lr_index - LR item index (location of the ".") in the prod list. +# lookaheads - LALR lookahead symbols for this item +# len - Length of the production (number of symbols on right hand side) +# lr_after - List of all productions that immediately follow +# lr_before - Grammar symbol immediately before # ----------------------------------------------------------------------------- -def add_production(f,file,line,prodname,syms): - - if Terminals.has_key(prodname): - sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) - return -1 - if prodname == 'error': - sys.stderr.write("%s:%d: Illegal rule name '%s'. 
error is a reserved word.\n" % (file,line,prodname)) - return -1 - - if not _is_identifier.match(prodname): - sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) - return -1 - - for x in range(len(syms)): - s = syms[x] - if s[0] in "'\"": - try: - c = eval(s) - if (len(c) > 1): - sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname)) - return -1 - if not Terminals.has_key(c): - Terminals[c] = [] - syms[x] = c - continue - except SyntaxError: - pass - if not _is_identifier.match(s) and s != '%prec': - sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) - return -1 - - # See if the rule is already in the rulemap - map = "%s -> %s" % (prodname,syms) - if Prodmap.has_key(map): - m = Prodmap[map] - sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) - sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) - return -1 - - p = Production() - p.name = prodname - p.prod = syms - p.file = file - p.line = line - p.func = f - p.number = len(Productions) - - - Productions.append(p) - Prodmap[map] = p - if not Nonterminals.has_key(prodname): - Nonterminals[prodname] = [ ] - - # Add all terminals to Terminals - i = 0 - while i < len(p.prod): - t = p.prod[i] - if t == '%prec': - try: - precname = p.prod[i+1] - except IndexError: - sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line)) - return -1 - - prec = Precedence.get(precname,None) - if not prec: - sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) - return -1 - else: - p.prec = prec - del p.prod[i] - del p.prod[i] - continue - - if Terminals.has_key(t): - Terminals[t].append(p.number) - # Is a terminal. 
We'll assign a precedence to p based on this - if not hasattr(p,"prec"): - p.prec = Precedence.get(t,('right',0)) - else: - if not Nonterminals.has_key(t): - Nonterminals[t] = [ ] - Nonterminals[t].append(p.number) - i += 1 +class LRItem(object): + def __init__(self,p,n): + self.name = p.name + self.prod = list(p.prod) + self.number = p.number + self.lr_index = n + self.lookaheads = { } + self.prod.insert(n,".") + self.prod = tuple(self.prod) + self.len = len(self.prod) + self.usyms = p.usyms - if not hasattr(p,"prec"): - p.prec = ('right',0) + def __str__(self): + if self.prod: + s = "%s -> %s" % (self.name," ".join(self.prod)) + else: + s = "%s -> <empty>" % self.name + return s - # Set final length of productions - p.len = len(p.prod) - p.prod = tuple(p.prod) + def __repr__(self): + return "LRItem("+str(self)+")" - # Calculate unique syms in the production - p.usyms = [ ] - for s in p.prod: - if s not in p.usyms: - p.usyms.append(s) +# ----------------------------------------------------------------------------- +# rightmost_terminal() +# +# Return the rightmost terminal from a list of symbols. Used in add_production() +# ----------------------------------------------------------------------------- +def rightmost_terminal(symbols, terminals): + i = len(symbols) - 1 + while i >= 0: + if symbols[i] in terminals: + return symbols[i] + i -= 1 + return None - # Add to the global productions list - try: - Prodnames[p.name].append(p) - except KeyError: - Prodnames[p.name] = [ p ] - return 0 +# ----------------------------------------------------------------------------- +# === GRAMMAR CLASS === +# +# The following class represents the contents of the specified grammar along +# with various computed properties such as first sets, follow sets, LR items, etc. +# This data is used for critical parts of the table generation process later. 
+# ----------------------------------------------------------------------------- -# Given a raw rule function, this function rips out its doc string -# and adds rules to the grammar +class GrammarError(YaccError): pass -def add_function(f): - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - error = 0 +class Grammar(object): + def __init__(self,terminals): + self.Productions = [None] # A list of all of the productions. The first + # entry is always reserved for the purpose of + # building an augmented grammar - if isinstance(f,types.MethodType): - reqdargs = 2 - else: - reqdargs = 1 - - if f.func_code.co_argcount > reqdargs: - sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) - return -1 - - if f.func_code.co_argcount < reqdargs: - sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) - return -1 - - if f.__doc__: - # Split the doc string into lines - pstrings = f.__doc__.splitlines() - lastp = None - dline = line - for ps in pstrings: - dline += 1 - p = ps.split() - if not p: continue - try: - if p[0] == '|': - # This is a continuation of a previous rule - if not lastp: - sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) - return -1 - prodname = lastp - if len(p) > 1: - syms = p[1:] - else: - syms = [ ] - else: - prodname = p[0] - lastp = prodname - assign = p[1] - if len(p) > 2: - syms = p[2:] - else: - syms = [ ] - if assign != ':' and assign != '::=': - sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) - return -1 + self.Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all + # productions of that nonterminal. + self.Prodmap = { } # A dictionary that is only used to detect duplicate + # productions. - e = add_production(f,file,dline,prodname,syms) - error += e + self.Terminals = { } # A dictionary mapping the names of terminal symbols to a + # list of the rules where they are used. 
+ for term in terminals: + self.Terminals[term] = [] - except StandardError: - sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) - error -= 1 - else: - sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) - return error - - -# Cycle checking code (Michael Dyck) - -def compute_reachable(): - ''' - Find each symbol that can be reached from the start symbol. - Print a warning for any nonterminals that can't be reached. - (Unused terminals have already had their warning.) - ''' - Reachable = { } - for s in Terminals.keys() + Nonterminals.keys(): - Reachable[s] = 0 - - mark_reachable_from( Productions[0].prod[0], Reachable ) - - for s in Nonterminals.keys(): - if not Reachable[s]: - sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) - -def mark_reachable_from(s, Reachable): - ''' - Mark all symbols that are reachable from symbol s. - ''' - if Reachable[s]: - # We've already reached symbol s. - return - Reachable[s] = 1 - for p in Prodnames.get(s,[]): - for r in p.prod: - mark_reachable_from(r, Reachable) + self.Terminals['error'] = [] -# ----------------------------------------------------------------------------- -# compute_terminates() -# -# This function looks at the various parsing rules and tries to detect -# infinite recursion cycles (grammar rules where there is no possible way -# to derive a string of only terminals). -# ----------------------------------------------------------------------------- -def compute_terminates(): - ''' - Raise an error for any symbols that don't terminate. - ''' - Terminates = {} - - # Terminals: - for t in Terminals.keys(): - Terminates[t] = 1 - - Terminates['$end'] = 1 - - # Nonterminals: - - # Initialize to false: - for n in Nonterminals.keys(): - Terminates[n] = 0 - - # Then propagate termination until no change: - while 1: - some_change = 0 - for (n,pl) in Prodnames.items(): - # Nonterminal n terminates iff any of its productions terminates. 
- for p in pl: - # Production p terminates iff all of its rhs symbols terminate. - for s in p.prod: - if not Terminates[s]: - # The symbol s does not terminate, - # so production p does not terminate. - p_terminates = 0 - break - else: - # didn't break from the loop, - # so every symbol s terminates - # so production p terminates. - p_terminates = 1 - - if p_terminates: - # symbol n terminates! - if not Terminates[n]: - Terminates[n] = 1 - some_change = 1 - # Don't need to consider any more productions for this n. - break - - if not some_change: - break - - some_error = 0 - for (s,terminates) in Terminates.items(): - if not terminates: - if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': - # s is used-but-not-defined, and we've already warned of that, - # so it would be overkill to say that it's also non-terminating. - pass - else: - sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) - some_error = 1 + self.Nonterminals = { } # A dictionary mapping names of nonterminals to a list + # of rule numbers where they are used. - return some_error + self.First = { } # A dictionary of precomputed FIRST(x) symbols -# ----------------------------------------------------------------------------- -# verify_productions() -# -# This function examines all of the supplied rules to see if they seem valid. -# ----------------------------------------------------------------------------- -def verify_productions(cycle_check=1): - error = 0 - for p in Productions: - if not p: continue + self.Follow = { } # A dictionary of precomputed FOLLOW(x) symbols - for s in p.prod: - if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': - sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) - error = 1 - continue + self.Precedence = { } # Precedence rules for each terminal. 
Contains tuples of the + # form ('right',level) or ('nonassoc', level) or ('left',level) - unused_tok = 0 - # Now verify all of the tokens - if yaccdebug: - _vf.write("Unused terminals:\n\n") - for s,v in Terminals.items(): - if s != 'error' and not v: - sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s) - if yaccdebug: _vf.write(" %s\n"% s) - unused_tok += 1 - - # Print out all of the productions - if yaccdebug: - _vf.write("\nGrammar\n\n") - for i in range(1,len(Productions)): - _vf.write("Rule %-5d %s\n" % (i, Productions[i])) - - unused_prod = 0 - # Verify the use of all productions - for s,v in Nonterminals.items(): - if not v: - p = Prodnames[s][0] - sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) - unused_prod += 1 - - - if unused_tok == 1: - sys.stderr.write("yacc: Warning. There is 1 unused token.\n") - if unused_tok > 1: - sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) - - if unused_prod == 1: - sys.stderr.write("yacc: Warning. There is 1 unused rule.\n") - if unused_prod > 1: - sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod) - - if yaccdebug: - _vf.write("\nTerminals, with rules where they appear\n\n") - ks = Terminals.keys() - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) - _vf.write("\nNonterminals, with rules where they appear\n\n") - ks = Nonterminals.keys() - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) - - if (cycle_check): - compute_reachable() - error += compute_terminates() -# error += check_cycles() - return error + self.UsedPrecedence = { } # Precedence rules that were actually used by the grammer. + # This is only used to provide error checking and to generate + # a warning about unused precedence rules. 
-# ----------------------------------------------------------------------------- -# build_lritems() -# -# This function walks the list of productions and builds a complete set of the -# LR items. The LR items are stored in two ways: First, they are uniquely -# numbered and placed in the list _lritems. Second, a linked list of LR items -# is built for each production. For example: -# -# E -> E PLUS E -# -# Creates the list -# -# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] -# ----------------------------------------------------------------------------- + self.Start = None # Starting symbol for the grammar -def build_lritems(): - for p in Productions: - lastlri = p - lri = p.lr_item(0) - i = 0 - while 1: - lri = p.lr_item(i) - lastlri.lr_next = lri - if not lri: break - lri.lr_num = len(LRitems) - LRitems.append(lri) - lastlri = lri - i += 1 - # In order for the rest of the parser generator to work, we need to - # guarantee that no more lritems are generated. Therefore, we nuke - # the p.lr_item method. (Only used in debugging) - # Production.lr_item = None + def __len__(self): + return len(self.Productions) + + def __getitem__(self,index): + return self.Productions[index] + + # ----------------------------------------------------------------------------- + # set_precedence() + # + # Sets the precedence for a given terminal. assoc is the associativity such as + # 'left','right', or 'nonassoc'. level is a numeric level. 
+ # + # ----------------------------------------------------------------------------- + + def set_precedence(self,term,assoc,level): + assert self.Productions == [None],"Must call set_precedence() before add_production()" + if term in self.Precedence: + raise GrammarError("Precedence already specified for terminal '%s'" % term) + if assoc not in ['left','right','nonassoc']: + raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'") + self.Precedence[term] = (assoc,level) + + # ----------------------------------------------------------------------------- + # add_production() + # + # Given an action function, this function assembles a production rule and + # computes its precedence level. + # + # The production rule is supplied as a list of symbols. For example, + # a rule such as 'expr : expr PLUS term' has a production name of 'expr' and + # symbols ['expr','PLUS','term']. + # + # Precedence is determined by the precedence of the right-most non-terminal + # or the precedence of a terminal specified by %prec. + # + # A variety of error checks are performed to make sure production symbols + # are valid and that %prec is used correctly. + # ----------------------------------------------------------------------------- + + def add_production(self,prodname,syms,func=None,file='',line=0): + + if prodname in self.Terminals: + raise GrammarError("%s:%d: Illegal rule name '%s'. Already defined as a token" % (file,line,prodname)) + if prodname == 'error': + raise GrammarError("%s:%d: Illegal rule name '%s'. 
error is a reserved word" % (file,line,prodname)) + if not _is_identifier.match(prodname): + raise GrammarError("%s:%d: Illegal rule name '%s'" % (file,line,prodname)) + + # Look for literal tokens + for n,s in enumerate(syms): + if s[0] in "'\"": + try: + c = eval(s) + if (len(c) > 1): + raise GrammarError("%s:%d: Literal token %s in rule '%s' may only be a single character" % (file,line,s, prodname)) + if not c in self.Terminals: + self.Terminals[c] = [] + syms[n] = c + continue + except SyntaxError: + pass + if not _is_identifier.match(s) and s != '%prec': + raise GrammarError("%s:%d: Illegal name '%s' in rule '%s'" % (file,line,s, prodname)) + + # Determine the precedence level + if '%prec' in syms: + if syms[-1] == '%prec': + raise GrammarError("%s:%d: Syntax error. Nothing follows %%prec" % (file,line)) + if syms[-2] != '%prec': + raise GrammarError("%s:%d: Syntax error. %%prec can only appear at the end of a grammar rule" % (file,line)) + precname = syms[-1] + prodprec = self.Precedence.get(precname,None) + if not prodprec: + raise GrammarError("%s:%d: Nothing known about the precedence of '%s'" % (file,line,precname)) + else: + self.UsedPrecedence[precname] = 1 + del syms[-2:] # Drop %prec from the rule + else: + # If no %prec, precedence is determined by the rightmost terminal symbol + precname = rightmost_terminal(syms,self.Terminals) + prodprec = self.Precedence.get(precname,('right',0)) + + # See if the rule is already in the rulemap + map = "%s -> %s" % (prodname,syms) + if map in self.Prodmap: + m = self.Prodmap[map] + raise GrammarError("%s:%d: Duplicate rule %s. " % (file,line, m) + + "Previous definition at %s:%d" % (m.file, m.line)) + + # From this point on, everything is valid. 
Create a new Production instance + pnumber = len(self.Productions) + if not prodname in self.Nonterminals: + self.Nonterminals[prodname] = [ ] + + # Add the production number to Terminals and Nonterminals + for t in syms: + if t in self.Terminals: + self.Terminals[t].append(pnumber) + else: + if not t in self.Nonterminals: + self.Nonterminals[t] = [ ] + self.Nonterminals[t].append(pnumber) -# ----------------------------------------------------------------------------- -# add_precedence() -# -# Given a list of precedence rules, add to the precedence table. -# ----------------------------------------------------------------------------- + # Create a production and add it to the list of productions + p = Production(pnumber,prodname,syms,prodprec,func,file,line) + self.Productions.append(p) + self.Prodmap[map] = p -def add_precedence(plist): - plevel = 0 - error = 0 - for p in plist: - plevel += 1 + # Add to the global productions list try: - prec = p[0] - terms = p[1:] - if prec != 'left' and prec != 'right' and prec != 'nonassoc': - sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) - return -1 - for t in terms: - if Precedence.has_key(t): - sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) - error += 1 - continue - Precedence[t] = (prec,plevel) - except: - sys.stderr.write("yacc: Invalid precedence table.\n") - error += 1 - - return error - -# ----------------------------------------------------------------------------- -# augment_grammar() -# -# Compute the augmented grammar. This is just a rule S' -> start where start -# is the starting symbol. 
-# ----------------------------------------------------------------------------- + self.Prodnames[prodname].append(p) + except KeyError: + self.Prodnames[prodname] = [ p ] + return 0 -def augment_grammar(start=None): - if not start: - start = Productions[1].name - Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) - Productions[0].usyms = [ start ] - Nonterminals[start].append(0) + # ----------------------------------------------------------------------------- + # set_start() + # + # Sets the starting symbol and creates the augmented grammar. Production + # rule 0 is S' -> start where start is the start symbol. + # ----------------------------------------------------------------------------- + + def set_start(self,start=None): + if not start: + start = self.Productions[1].name + if start not in self.Nonterminals: + raise GrammarError("start symbol %s undefined" % start) + self.Productions[0] = Production(0,"S'",[start]) + self.Nonterminals[start].append(0) + self.Start = start + + # ----------------------------------------------------------------------------- + # find_unreachable() + # + # Find all of the nonterminal symbols that can't be reached from the starting + # symbol. Returns a list of nonterminals that can't be reached. + # ----------------------------------------------------------------------------- + + def find_unreachable(self): + + # Mark all symbols that are reachable from a symbol s + def mark_reachable_from(s): + if reachable[s]: + # We've already reached symbol s. 
+ return + reachable[s] = 1 + for p in self.Prodnames.get(s,[]): + for r in p.prod: + mark_reachable_from(r) + + reachable = { } + for s in list(self.Terminals) + list(self.Nonterminals): + reachable[s] = 0 + + mark_reachable_from( self.Productions[0].prod[0] ) + + return [s for s in list(self.Nonterminals) + if not reachable[s]] + + # ----------------------------------------------------------------------------- + # infinite_cycles() + # + # This function looks at the various parsing rules and tries to detect + # infinite recursion cycles (grammar rules where there is no possible way + # to derive a string of only terminals). + # ----------------------------------------------------------------------------- + + def infinite_cycles(self): + terminates = {} + + # Terminals: + for t in self.Terminals: + terminates[t] = 1 + + terminates['$end'] = 1 + + # Nonterminals: + + # Initialize to false: + for n in self.Nonterminals: + terminates[n] = 0 + + # Then propagate termination until no change: + while 1: + some_change = 0 + for (n,pl) in self.Prodnames.items(): + # Nonterminal n terminates iff any of its productions terminates. + for p in pl: + # Production p terminates iff all of its rhs symbols terminate. + for s in p.prod: + if not terminates[s]: + # The symbol s does not terminate, + # so production p does not terminate. + p_terminates = 0 + break + else: + # didn't break from the loop, + # so every symbol s terminates + # so production p terminates. + p_terminates = 1 + + if p_terminates: + # symbol n terminates! + if not terminates[n]: + terminates[n] = 1 + some_change = 1 + # Don't need to consider any more productions for this n. + break + if not some_change: + break -# ------------------------------------------------------------------------- -# first() -# -# Compute the value of FIRST1(beta) where beta is a tuple of symbols. -# -# During execution of compute_first1, the result may be incomplete. 
-# Afterward (e.g., when called from compute_follow()), it will be complete. -# ------------------------------------------------------------------------- -def first(beta): - - # We are computing First(x1,x2,x3,...,xn) - result = [ ] - for x in beta: - x_produces_empty = 0 - - # Add all the non-<empty> symbols of First[x] to the result. - for f in First[x]: - if f == '<empty>': - x_produces_empty = 1 - else: - if f not in result: result.append(f) + infinite = [] + for (s,term) in terminates.items(): + if not term: + if not s in self.Prodnames and not s in self.Terminals and s != 'error': + # s is used-but-not-defined, and we've already warned of that, + # so it would be overkill to say that it's also non-terminating. + pass + else: + infinite.append(s) - if x_produces_empty: - # We have to consider the next x in beta, - # i.e. stay in the loop. - pass - else: - # We don't have to consider any further symbols in beta. - break - else: - # There was no 'break' from the loop, - # so x_produces_empty was true for all x in beta, - # so beta produces empty as well. - result.append('<empty>') + return infinite - return result + # ----------------------------------------------------------------------------- + # undefined_symbols() + # + # Find all symbols that were used the grammar, but not defined as tokens or + # grammar rules. Returns a list of tuples (sym, prod) where sym in the symbol + # and prod is the production where the symbol was used. + # ----------------------------------------------------------------------------- + def undefined_symbols(self): + result = [] + for p in self.Productions: + if not p: continue -# FOLLOW(x) -# Given a non-terminal. This function computes the set of all symbols -# that might follow it. Dragon book, p. 189. 
- -def compute_follow(start=None): - # Add '$end' to the follow list of the start symbol - for k in Nonterminals.keys(): - Follow[k] = [ ] - - if not start: - start = Productions[1].name - - Follow[start] = [ '$end' ] - - while 1: - didadd = 0 - for p in Productions[1:]: - # Here is the production set - for i in range(len(p.prod)): - B = p.prod[i] - if Nonterminals.has_key(B): - # Okay. We got a non-terminal in a production - fst = first(p.prod[i+1:]) - hasempty = 0 - for f in fst: - if f != '<empty>' and f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if f == '<empty>': - hasempty = 1 - if hasempty or i == (len(p.prod)-1): - # Add elements of follow(a) to follow(b) - for f in Follow[p.name]: - if f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if not didadd: break + for s in p.prod: + if not s in self.Prodnames and not s in self.Terminals and s != 'error': + result.append((s,p)) + return result + + # ----------------------------------------------------------------------------- + # unused_terminals() + # + # Find all terminals that were defined, but not used by the grammar. Returns + # a list of all symbols. + # ----------------------------------------------------------------------------- + def unused_terminals(self): + unused_tok = [] + for s,v in self.Terminals.items(): + if s != 'error' and not v: + unused_tok.append(s) + + return unused_tok + + # ------------------------------------------------------------------------------ + # unused_rules() + # + # Find all grammar rules that were defined, but not used (maybe not reachable) + # Returns a list of productions. 
+ # ------------------------------------------------------------------------------ + + def unused_rules(self): + unused_prod = [] + for s,v in self.Nonterminals.items(): + if not v: + p = self.Prodnames[s][0] + unused_prod.append(p) + return unused_prod + + # ----------------------------------------------------------------------------- + # unused_precedence() + # + # Returns a list of tuples (term,precedence) corresponding to precedence + # rules that were never used by the grammar. term is the name of the terminal + # on which precedence was applied and precedence is a string such as 'left' or + # 'right' corresponding to the type of precedence. + # ----------------------------------------------------------------------------- + + def unused_precedence(self): + unused = [] + for termname in self.Precedence: + if not (termname in self.Terminals or termname in self.UsedPrecedence): + unused.append((termname,self.Precedence[termname][0])) + + return unused + + # ------------------------------------------------------------------------- + # _first() + # + # Compute the value of FIRST1(beta) where beta is a tuple of symbols. + # + # During execution of compute_first1, the result may be incomplete. + # Afterward (e.g., when called from compute_follow()), it will be complete. + # ------------------------------------------------------------------------- + def _first(self,beta): + + # We are computing First(x1,x2,x3,...,xn) + result = [ ] + for x in beta: + x_produces_empty = 0 + + # Add all the non-<empty> symbols of First[x] to the result. + for f in self.First[x]: + if f == '<empty>': + x_produces_empty = 1 + else: + if f not in result: result.append(f) - if 0 and yaccdebug: - _vf.write('\nFollow:\n') - for k in Nonterminals.keys(): - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) + if x_produces_empty: + # We have to consider the next x in beta, + # i.e. stay in the loop. + pass + else: + # We don't have to consider any further symbols in beta. 
+ break + else: + # There was no 'break' from the loop, + # so x_produces_empty was true for all x in beta, + # so beta produces empty as well. + result.append('<empty>') -# ------------------------------------------------------------------------- -# compute_first1() -# -# Compute the value of FIRST1(X) for all symbols -# ------------------------------------------------------------------------- -def compute_first1(): - - # Terminals: - for t in Terminals.keys(): - First[t] = [t] - - First['$end'] = ['$end'] - First['#'] = ['#'] # what's this for? - - # Nonterminals: - - # Initialize to the empty set: - for n in Nonterminals.keys(): - First[n] = [] - - # Then propagate symbols until no change: - while 1: - some_change = 0 - for n in Nonterminals.keys(): - for p in Prodnames[n]: - for f in first(p.prod): - if f not in First[n]: - First[n].append( f ) - some_change = 1 - if not some_change: - break - - if 0 and yaccdebug: - _vf.write('\nFirst:\n') - for k in Nonterminals.keys(): - _vf.write("%-20s : %s\n" % - (k, " ".join([str(s) for s in First[k]]))) + return result -# ----------------------------------------------------------------------------- -# === SLR Generation === -# -# The following functions are used to construct SLR (Simple LR) parsing tables -# as described on p.221-229 of the dragon book. 
-# ----------------------------------------------------------------------------- + # ------------------------------------------------------------------------- + # compute_first() + # + # Compute the value of FIRST1(X) for all symbols + # ------------------------------------------------------------------------- + def compute_first(self): + if self.First: + return self.First -# Global variables for the LR parsing engine -def lr_init_vars(): - global _lr_action, _lr_goto, _lr_method - global _lr_goto_cache, _lr0_cidhash - - _lr_action = { } # Action table - _lr_goto = { } # Goto table - _lr_method = "Unknown" # LR method used - _lr_goto_cache = { } - _lr0_cidhash = { } - - -# Compute the LR(0) closure operation on I, where I is a set of LR(0) items. -# prodlist is a list of productions. - -_add_count = 0 # Counter used to detect cycles - -def lr0_closure(I): - global _add_count - - _add_count += 1 - prodlist = Productions - - # Add everything in I to J - J = I[:] - didadd = 1 - while didadd: - didadd = 0 - for j in J: - for x in j.lrafter: - if x.lr0_added == _add_count: continue - # Add B --> .G to J - J.append(x.lr_next) - x.lr0_added = _add_count - didadd = 1 - - return J - -# Compute the LR(0) goto function goto(I,X) where I is a set -# of LR(0) items and X is a grammar symbol. This function is written -# in a way that guarantees uniqueness of the generated goto sets -# (i.e. the same goto set will never be returned as two different Python -# objects). With uniqueness, we can later do fast set comparisons using -# id(obj) instead of element-wise comparison. 
- -def lr0_goto(I,x): - # First we look for a previously cached entry - g = _lr_goto_cache.get((id(I),x),None) - if g: return g - - # Now we generate the goto set in a way that guarantees uniqueness - # of the result - - s = _lr_goto_cache.get(x,None) - if not s: - s = { } - _lr_goto_cache[x] = s - - gs = [ ] - for p in I: - n = p.lr_next - if n and n.lrbefore == x: - s1 = s.get(id(n),None) - if not s1: - s1 = { } - s[id(n)] = s1 - gs.append(n) - s = s1 - g = s.get('$end',None) - if not g: - if gs: - g = lr0_closure(gs) - s['$end'] = g - else: - s['$end'] = gs - _lr_goto_cache[(id(I),x)] = g - return g - -_lr0_cidhash = { } - -# Compute the LR(0) sets of item function -def lr0_items(): - - C = [ lr0_closure([Productions[0].lr_next]) ] - i = 0 - for I in C: - _lr0_cidhash[id(I)] = i - i += 1 - - # Loop over the items in C and each grammar symbols - i = 0 - while i < len(C): - I = C[i] - i += 1 - - # Collect all of the symbols that could possibly be in the goto(I,X) sets - asyms = { } - for ii in I: - for s in ii.usyms: - asyms[s] = None - - for x in asyms.keys(): - g = lr0_goto(I,x) - if not g: continue - if _lr0_cidhash.has_key(id(g)): continue - _lr0_cidhash[id(g)] = len(C) - C.append(g) - - return C + # Terminals: + for t in self.Terminals: + self.First[t] = [t] -# ----------------------------------------------------------------------------- -# ==== LALR(1) Parsing ==== -# -# LALR(1) parsing is almost exactly the same as SLR except that instead of -# relying upon Follow() sets when performing reductions, a more selective -# lookahead set that incorporates the state of the LR(0) machine is utilized. -# Thus, we mainly just have to focus on calculating the lookahead sets. -# -# The method used here is due to DeRemer and Pennelo (1982). -# -# DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) -# Lookahead Sets", ACM Transactions on Programming Languages and Systems, -# Vol. 4, No. 4, Oct. 1982, pp. 
615-649 -# -# Further details can also be found in: -# -# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", -# McGraw-Hill Book Company, (1985). -# -# Note: This implementation is a complete replacement of the LALR(1) -# implementation in PLY-1.x releases. That version was based on -# a less efficient algorithm and it had bugs in its implementation. -# ----------------------------------------------------------------------------- + self.First['$end'] = ['$end'] -# ----------------------------------------------------------------------------- -# compute_nullable_nonterminals() -# -# Creates a dictionary containing all of the non-terminals that might produce -# an empty production. -# ----------------------------------------------------------------------------- + # Nonterminals: -def compute_nullable_nonterminals(): - nullable = {} - num_nullable = 0 - while 1: - for p in Productions[1:]: - if p.len == 0: - nullable[p.name] = 1 - continue - for t in p.prod: - if not nullable.has_key(t): break - else: - nullable[p.name] = 1 - if len(nullable) == num_nullable: break - num_nullable = len(nullable) - return nullable + # Initialize to the empty set: + for n in self.Nonterminals: + self.First[n] = [] -# ----------------------------------------------------------------------------- -# find_nonterminal_trans(C) -# -# Given a set of LR(0) items, this functions finds all of the non-terminal -# transitions. These are transitions in which a dot appears immediately before -# a non-terminal. Returns a list of tuples of the form (state,N) where state -# is the state number and N is the nonterminal symbol. -# -# The input C is the set of LR(0) items. 
-# ----------------------------------------------------------------------------- + # Then propagate symbols until no change: + while 1: + some_change = 0 + for n in self.Nonterminals: + for p in self.Prodnames[n]: + for f in self._first(p.prod): + if f not in self.First[n]: + self.First[n].append( f ) + some_change = 1 + if not some_change: + break + + return self.First + + # --------------------------------------------------------------------- + # compute_follow() + # + # Computes all of the follow sets for every non-terminal symbol. The + # follow set is the set of all symbols that might follow a given + # non-terminal. See the Dragon book, 2nd Ed. p. 189. + # --------------------------------------------------------------------- + def compute_follow(self,start=None): + # If already computed, return the result + if self.Follow: + return self.Follow + + # If first sets not computed yet, do that first. + if not self.First: + self.compute_first() + + # Add '$end' to the follow list of the start symbol + for k in self.Nonterminals: + self.Follow[k] = [ ] + + if not start: + start = self.Productions[1].name + + self.Follow[start] = [ '$end' ] -def find_nonterminal_transitions(C): - trans = [] - for state in range(len(C)): - for p in C[state]: - if p.lr_index < p.len - 1: - t = (state,p.prod[p.lr_index+1]) - if Nonterminals.has_key(t[1]): - if t not in trans: trans.append(t) - state = state + 1 - return trans + while 1: + didadd = 0 + for p in self.Productions[1:]: + # Here is the production set + for i in range(len(p.prod)): + B = p.prod[i] + if B in self.Nonterminals: + # Okay. 
We got a non-terminal in a production + fst = self._first(p.prod[i+1:]) + hasempty = 0 + for f in fst: + if f != '<empty>' and f not in self.Follow[B]: + self.Follow[B].append(f) + didadd = 1 + if f == '<empty>': + hasempty = 1 + if hasempty or i == (len(p.prod)-1): + # Add elements of follow(a) to follow(b) + for f in self.Follow[p.name]: + if f not in self.Follow[B]: + self.Follow[B].append(f) + didadd = 1 + if not didadd: break + return self.Follow + + + # ----------------------------------------------------------------------------- + # build_lritems() + # + # This function walks the list of productions and builds a complete set of the + # LR items. The LR items are stored in two ways: First, they are uniquely + # numbered and placed in the list _lritems. Second, a linked list of LR items + # is built for each production. For example: + # + # E -> E PLUS E + # + # Creates the list + # + # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] + # ----------------------------------------------------------------------------- + + def build_lritems(self): + for p in self.Productions: + lastlri = p + i = 0 + lr_items = [] + while 1: + if i > len(p): + lri = None + else: + lri = LRItem(p,i) + # Precompute the list of productions immediately following + try: + lri.lr_after = self.Prodnames[lri.prod[i+1]] + except (IndexError,KeyError): + lri.lr_after = [] + try: + lri.lr_before = lri.prod[i-1] + except IndexError: + lri.lr_before = None + + lastlri.lr_next = lri + if not lri: break + lr_items.append(lri) + lastlri = lri + i += 1 + p.lr_items = lr_items # ----------------------------------------------------------------------------- -# dr_relation() -# -# Computes the DR(p,A) relationships for non-terminal transitions. The input -# is a tuple (state,N) where state is a number and N is a nonterminal symbol. +# == Class LRTable == # -# Returns a list of terminals. +# This basic class represents a basic table of LR parsing information. 
+# Methods for generating the tables are not defined here. They are defined +# in the derived class LRGeneratedTable. # ----------------------------------------------------------------------------- -def dr_relation(C,trans,nullable): - dr_set = { } - state,N = trans - terms = [] +class VersionError(YaccError): pass - g = lr0_goto(C[state],N) - for p in g: - if p.lr_index < p.len - 1: - a = p.prod[p.lr_index+1] - if Terminals.has_key(a): - if a not in terms: terms.append(a) +class LRTable(object): + def __init__(self): + self.lr_action = None + self.lr_goto = None + self.lr_productions = None + self.lr_method = None - # This extra bit is to handle the start state - if state == 0 and N == Productions[0].prod[0]: - terms.append('$end') + def read_table(self,module): + if isinstance(module,types.ModuleType): + parsetab = module + else: + if sys.version_info[0] < 3: + exec("import %s as parsetab" % module) + else: + env = { } + exec("import %s as parsetab" % module, env, env) + parsetab = env['parsetab'] - return terms + if parsetab._tabversion != __tabversion__: + raise VersionError("yacc table file version is out of date") -# ----------------------------------------------------------------------------- -# reads_relation() -# -# Computes the READS() relation (p,A) READS (t,C). 
-# ----------------------------------------------------------------------------- - -def reads_relation(C, trans, empty): - # Look for empty transitions - rel = [] - state, N = trans + self.lr_action = parsetab._lr_action + self.lr_goto = parsetab._lr_goto - g = lr0_goto(C[state],N) - j = _lr0_cidhash.get(id(g),-1) - for p in g: - if p.lr_index < p.len - 1: - a = p.prod[p.lr_index + 1] - if empty.has_key(a): - rel.append((j,a)) + self.lr_productions = [] + for p in parsetab._lr_productions: + self.lr_productions.append(MiniProduction(*p)) - return rel + self.lr_method = parsetab._lr_method + return parsetab._lr_signature + def read_pickle(self,filename): + try: + import cPickle as pickle + except ImportError: + import pickle + + in_f = open(filename,"rb") + + tabversion = pickle.load(in_f) + if tabversion != __tabversion__: + raise VersionError("yacc table file version is out of date") + self.lr_method = pickle.load(in_f) + signature = pickle.load(in_f) + self.lr_action = pickle.load(in_f) + self.lr_goto = pickle.load(in_f) + productions = pickle.load(in_f) + + self.lr_productions = [] + for p in productions: + self.lr_productions.append(MiniProduction(*p)) + + in_f.close() + return signature + + # Bind all production function names to callable objects in pdict + def bind_callables(self,pdict): + for p in self.lr_productions: + p.bind(pdict) + # ----------------------------------------------------------------------------- -# compute_lookback_includes() -# -# Determines the lookback and includes relations -# -# LOOKBACK: -# -# This relation is determined by running the LR(0) state machine forward. -# For example, starting with a production "N : . A B C", we run it forward -# to obtain "N : A B C ." We then build a relationship between this final -# state and the starting state. These relationships are stored in a dictionary -# lookdict. -# -# INCLUDES: -# -# Computes the INCLUDE() relation (p,A) INCLUDES (p',B). 
-# -# This relation is used to determine non-terminal transitions that occur -# inside of other non-terminal transition states. (p,A) INCLUDES (p', B) -# if the following holds: -# -# B -> LAT, where T -> epsilon and p' -L-> p -# -# L is essentially a prefix (which may be empty), T is a suffix that must be -# able to derive an empty string. State p' must lead to state p with the string L. +# === LR Generator === # +# The following classes and functions are used to generate LR parsing tables on +# a grammar. # ----------------------------------------------------------------------------- -def compute_lookback_includes(C,trans,nullable): - - lookdict = {} # Dictionary of lookback relations - includedict = {} # Dictionary of include relations - - # Make a dictionary of non-terminal transitions - dtrans = {} - for t in trans: - dtrans[t] = 1 - - # Loop over all transitions and compute lookbacks and includes - for state,N in trans: - lookb = [] - includes = [] - for p in C[state]: - if p.name != N: continue - - # Okay, we have a name match. We now follow the production all the way - # through the state machine until we get the . on the right hand side - - lr_index = p.lr_index - j = state - while lr_index < p.len - 1: - lr_index = lr_index + 1 - t = p.prod[lr_index] - - # Check to see if this symbol and state are a non-terminal transition - if dtrans.has_key((j,t)): - # Yes. 
Okay, there is some chance that this is an includes relation - # the only way to know for certain is whether the rest of the - # production derives empty - - li = lr_index + 1 - while li < p.len: - if Terminals.has_key(p.prod[li]): break # No forget it - if not nullable.has_key(p.prod[li]): break - li = li + 1 - else: - # Appears to be a relation between (j,t) and (state,N) - includes.append((j,t)) - - g = lr0_goto(C[j],t) # Go to next set - j = _lr0_cidhash.get(id(g),-1) # Go to next state - - # When we get here, j is the final state, now we have to locate the production - for r in C[j]: - if r.name != p.name: continue - if r.len != p.len: continue - i = 0 - # This look is comparing a production ". A B C" with "A B C ." - while i < r.lr_index: - if r.prod[i] != p.prod[i+1]: break - i = i + 1 - else: - lookb.append((j,r)) - for i in includes: - if not includedict.has_key(i): includedict[i] = [] - includedict[i].append((state,N)) - lookdict[(state,N)] = lookb - - return lookdict,includedict - # ----------------------------------------------------------------------------- # digraph() # traverse() @@ -1525,700 +1919,1358 @@ def traverse(x,N,stack,F,X,R,FP): for a in F.get(y,[]): if a not in F[x]: F[x].append(a) if N[x] == d: - N[stack[-1]] = sys.maxint + N[stack[-1]] = MAXINT F[stack[-1]] = F[x] element = stack.pop() while element != x: - N[stack[-1]] = sys.maxint + N[stack[-1]] = MAXINT F[stack[-1]] = F[x] element = stack.pop() -# ----------------------------------------------------------------------------- -# compute_read_sets() -# -# Given a set of LR(0) items, this function computes the read sets. 
-# -# Inputs: C = Set of LR(0) items -# ntrans = Set of nonterminal transitions -# nullable = Set of empty transitions -# -# Returns a set containing the read sets -# ----------------------------------------------------------------------------- - -def compute_read_sets(C, ntrans, nullable): - FP = lambda x: dr_relation(C,x,nullable) - R = lambda x: reads_relation(C,x,nullable) - F = digraph(ntrans,R,FP) - return F - -# ----------------------------------------------------------------------------- -# compute_follow_sets() -# -# Given a set of LR(0) items, a set of non-terminal transitions, a readset, -# and an include set, this function computes the follow sets -# -# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} -# -# Inputs: -# ntrans = Set of nonterminal transitions -# readsets = Readset (previously computed) -# inclsets = Include sets (previously computed) -# -# Returns a set containing the follow sets -# ----------------------------------------------------------------------------- - -def compute_follow_sets(ntrans,readsets,inclsets): - FP = lambda x: readsets[x] - R = lambda x: inclsets.get(x,[]) - F = digraph(ntrans,R,FP) - return F +class LALRError(YaccError): pass # ----------------------------------------------------------------------------- -# add_lookaheads() -# -# Attaches the lookahead symbols to grammar rules. +# == LRGeneratedTable == # -# Inputs: lookbacks - Set of lookback relations -# followset - Computed follow set -# -# This function directly attaches the lookaheads to productions contained -# in the lookbacks set +# This class implements the LR table generation algorithm. 
There are no +# public methods except for write() # ----------------------------------------------------------------------------- -def add_lookaheads(lookbacks,followset): - for trans,lb in lookbacks.items(): - # Loop over productions in lookback - for state,p in lb: - if not p.lookaheads.has_key(state): - p.lookaheads[state] = [] - f = followset.get(trans,[]) - for a in f: - if a not in p.lookaheads[state]: p.lookaheads[state].append(a) - -# ----------------------------------------------------------------------------- -# add_lalr_lookaheads() -# -# This function does all of the work of adding lookahead information for use -# with LALR parsing -# ----------------------------------------------------------------------------- - -def add_lalr_lookaheads(C): - # Determine all of the nullable nonterminals - nullable = compute_nullable_nonterminals() - - # Find all non-terminal transitions - trans = find_nonterminal_transitions(C) - - # Compute read sets - readsets = compute_read_sets(C,trans,nullable) - - # Compute lookback/includes relations - lookd, included = compute_lookback_includes(C,trans,nullable) - - # Compute LALR FOLLOW sets - followsets = compute_follow_sets(trans,readsets,included) - - # Add all of the lookaheads - add_lookaheads(lookd,followsets) - -# ----------------------------------------------------------------------------- -# lr_parse_table() -# -# This function constructs the parse tables for SLR or LALR -# ----------------------------------------------------------------------------- -def lr_parse_table(method): - global _lr_method - goto = _lr_goto # Goto array - action = _lr_action # Action array - actionp = { } # Action production array (temporary) - - _lr_method = method - - n_srconflict = 0 - n_rrconflict = 0 - - if yaccdebug: - sys.stderr.write("yacc: Generating %s parsing table...\n" % method) - _vf.write("\n\nParsing method: %s\n\n" % method) - - # Step 1: Construct C = { I0, I1, ... 
IN}, collection of LR(0) items - # This determines the number of states +class LRGeneratedTable(LRTable): + def __init__(self,grammar,method='LALR',log=None): + if method not in ['SLR','LALR']: + raise LALRError("Unsupported method %s" % method) + + self.grammar = grammar + self.lr_method = method + + # Set up the logger + if not log: + log = NullLogger() + self.log = log + + # Internal attributes + self.lr_action = {} # Action table + self.lr_goto = {} # Goto table + self.lr_productions = grammar.Productions # Copy of grammar Production array + self.lr_goto_cache = {} # Cache of computed gotos + self.lr0_cidhash = {} # Cache of closures + + self._add_count = 0 # Internal counter used to detect cycles + + # Diagonistic information filled in by the table generator + self.sr_conflict = 0 + self.rr_conflict = 0 + self.conflicts = [] # List of conflicts + + self.sr_conflicts = [] + self.rr_conflicts = [] + + # Build the tables + self.grammar.build_lritems() + self.grammar.compute_first() + self.grammar.compute_follow() + self.lr_parse_table() + + # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. + + def lr0_closure(self,I): + self._add_count += 1 + + # Add everything in I to J + J = I[:] + didadd = 1 + while didadd: + didadd = 0 + for j in J: + for x in j.lr_after: + if getattr(x,"lr0_added",0) == self._add_count: continue + # Add B --> .G to J + J.append(x.lr_next) + x.lr0_added = self._add_count + didadd = 1 + + return J + + # Compute the LR(0) goto function goto(I,X) where I is a set + # of LR(0) items and X is a grammar symbol. This function is written + # in a way that guarantees uniqueness of the generated goto sets + # (i.e. the same goto set will never be returned as two different Python + # objects). With uniqueness, we can later do fast set comparisons using + # id(obj) instead of element-wise comparison. 
+ + def lr0_goto(self,I,x): + # First we look for a previously cached entry + g = self.lr_goto_cache.get((id(I),x),None) + if g: return g + + # Now we generate the goto set in a way that guarantees uniqueness + # of the result + + s = self.lr_goto_cache.get(x,None) + if not s: + s = { } + self.lr_goto_cache[x] = s + + gs = [ ] + for p in I: + n = p.lr_next + if n and n.lr_before == x: + s1 = s.get(id(n),None) + if not s1: + s1 = { } + s[id(n)] = s1 + gs.append(n) + s = s1 + g = s.get('$end',None) + if not g: + if gs: + g = self.lr0_closure(gs) + s['$end'] = g + else: + s['$end'] = gs + self.lr_goto_cache[(id(I),x)] = g + return g - C = lr0_items() + # Compute the LR(0) sets of item function + def lr0_items(self): - if method == 'LALR': - add_lalr_lookaheads(C) + C = [ self.lr0_closure([self.grammar.Productions[0].lr_next]) ] + i = 0 + for I in C: + self.lr0_cidhash[id(I)] = i + i += 1 + # Loop over the items in C and each grammar symbols + i = 0 + while i < len(C): + I = C[i] + i += 1 - # Build the parser table, state by state - st = 0 - for I in C: - # Loop over each production in I - actlist = [ ] # List of actions - st_action = { } - st_actionp = { } - st_goto = { } - if yaccdebug: - _vf.write("\nstate %d\n\n" % st) + # Collect all of the symbols that could possibly be in the goto(I,X) sets + asyms = { } + for ii in I: + for s in ii.usyms: + asyms[s] = None + + for x in asyms: + g = self.lr0_goto(I,x) + if not g: continue + if id(g) in self.lr0_cidhash: continue + self.lr0_cidhash[id(g)] = len(C) + C.append(g) + + return C + + # ----------------------------------------------------------------------------- + # ==== LALR(1) Parsing ==== + # + # LALR(1) parsing is almost exactly the same as SLR except that instead of + # relying upon Follow() sets when performing reductions, a more selective + # lookahead set that incorporates the state of the LR(0) machine is utilized. + # Thus, we mainly just have to focus on calculating the lookahead sets. 
+ # + # The method used here is due to DeRemer and Pennelo (1982). + # + # DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) + # Lookahead Sets", ACM Transactions on Programming Languages and Systems, + # Vol. 4, No. 4, Oct. 1982, pp. 615-649 + # + # Further details can also be found in: + # + # J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", + # McGraw-Hill Book Company, (1985). + # + # ----------------------------------------------------------------------------- + + # ----------------------------------------------------------------------------- + # compute_nullable_nonterminals() + # + # Creates a dictionary containing all of the non-terminals that might produce + # an empty production. + # ----------------------------------------------------------------------------- + + def compute_nullable_nonterminals(self): + nullable = {} + num_nullable = 0 + while 1: + for p in self.grammar.Productions[1:]: + if p.len == 0: + nullable[p.name] = 1 + continue + for t in p.prod: + if not t in nullable: break + else: + nullable[p.name] = 1 + if len(nullable) == num_nullable: break + num_nullable = len(nullable) + return nullable + + # ----------------------------------------------------------------------------- + # find_nonterminal_trans(C) + # + # Given a set of LR(0) items, this functions finds all of the non-terminal + # transitions. These are transitions in which a dot appears immediately before + # a non-terminal. Returns a list of tuples of the form (state,N) where state + # is the state number and N is the nonterminal symbol. + # + # The input C is the set of LR(0) items. 
+ # ----------------------------------------------------------------------------- + + def find_nonterminal_transitions(self,C): + trans = [] + for state in range(len(C)): + for p in C[state]: + if p.lr_index < p.len - 1: + t = (state,p.prod[p.lr_index+1]) + if t[1] in self.grammar.Nonterminals: + if t not in trans: trans.append(t) + state = state + 1 + return trans + + # ----------------------------------------------------------------------------- + # dr_relation() + # + # Computes the DR(p,A) relationships for non-terminal transitions. The input + # is a tuple (state,N) where state is a number and N is a nonterminal symbol. + # + # Returns a list of terminals. + # ----------------------------------------------------------------------------- + + def dr_relation(self,C,trans,nullable): + dr_set = { } + state,N = trans + terms = [] + + g = self.lr0_goto(C[state],N) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index+1] + if a in self.grammar.Terminals: + if a not in terms: terms.append(a) + + # This extra bit is to handle the start state + if state == 0 and N == self.grammar.Productions[0].prod[0]: + terms.append('$end') + + return terms + + # ----------------------------------------------------------------------------- + # reads_relation() + # + # Computes the READS() relation (p,A) READS (t,C). + # ----------------------------------------------------------------------------- + + def reads_relation(self,C, trans, empty): + # Look for empty transitions + rel = [] + state, N = trans + + g = self.lr0_goto(C[state],N) + j = self.lr0_cidhash.get(id(g),-1) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index + 1] + if a in empty: + rel.append((j,a)) + + return rel + + # ----------------------------------------------------------------------------- + # compute_lookback_includes() + # + # Determines the lookback and includes relations + # + # LOOKBACK: + # + # This relation is determined by running the LR(0) state machine forward. 
+ # For example, starting with a production "N : . A B C", we run it forward + # to obtain "N : A B C ." We then build a relationship between this final + # state and the starting state. These relationships are stored in a dictionary + # lookdict. + # + # INCLUDES: + # + # Computes the INCLUDE() relation (p,A) INCLUDES (p',B). + # + # This relation is used to determine non-terminal transitions that occur + # inside of other non-terminal transition states. (p,A) INCLUDES (p', B) + # if the following holds: + # + # B -> LAT, where T -> epsilon and p' -L-> p + # + # L is essentially a prefix (which may be empty), T is a suffix that must be + # able to derive an empty string. State p' must lead to state p with the string L. + # + # ----------------------------------------------------------------------------- + + def compute_lookback_includes(self,C,trans,nullable): + + lookdict = {} # Dictionary of lookback relations + includedict = {} # Dictionary of include relations + + # Make a dictionary of non-terminal transitions + dtrans = {} + for t in trans: + dtrans[t] = 1 + + # Loop over all transitions and compute lookbacks and includes + for state,N in trans: + lookb = [] + includes = [] + for p in C[state]: + if p.name != N: continue + + # Okay, we have a name match. We now follow the production all the way + # through the state machine until we get the . on the right hand side + + lr_index = p.lr_index + j = state + while lr_index < p.len - 1: + lr_index = lr_index + 1 + t = p.prod[lr_index] + + # Check to see if this symbol and state are a non-terminal transition + if (j,t) in dtrans: + # Yes. 
Okay, there is some chance that this is an includes relation + # the only way to know for certain is whether the rest of the + # production derives empty + + li = lr_index + 1 + while li < p.len: + if p.prod[li] in self.grammar.Terminals: break # No forget it + if not p.prod[li] in nullable: break + li = li + 1 + else: + # Appears to be a relation between (j,t) and (state,N) + includes.append((j,t)) + + g = self.lr0_goto(C[j],t) # Go to next set + j = self.lr0_cidhash.get(id(g),-1) # Go to next state + + # When we get here, j is the final state, now we have to locate the production + for r in C[j]: + if r.name != p.name: continue + if r.len != p.len: continue + i = 0 + # This look is comparing a production ". A B C" with "A B C ." + while i < r.lr_index: + if r.prod[i] != p.prod[i+1]: break + i = i + 1 + else: + lookb.append((j,r)) + for i in includes: + if not i in includedict: includedict[i] = [] + includedict[i].append((state,N)) + lookdict[(state,N)] = lookb + + return lookdict,includedict + + # ----------------------------------------------------------------------------- + # compute_read_sets() + # + # Given a set of LR(0) items, this function computes the read sets. 
+ # + # Inputs: C = Set of LR(0) items + # ntrans = Set of nonterminal transitions + # nullable = Set of empty transitions + # + # Returns a set containing the read sets + # ----------------------------------------------------------------------------- + + def compute_read_sets(self,C, ntrans, nullable): + FP = lambda x: self.dr_relation(C,x,nullable) + R = lambda x: self.reads_relation(C,x,nullable) + F = digraph(ntrans,R,FP) + return F + + # ----------------------------------------------------------------------------- + # compute_follow_sets() + # + # Given a set of LR(0) items, a set of non-terminal transitions, a readset, + # and an include set, this function computes the follow sets + # + # Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} + # + # Inputs: + # ntrans = Set of nonterminal transitions + # readsets = Readset (previously computed) + # inclsets = Include sets (previously computed) + # + # Returns a set containing the follow sets + # ----------------------------------------------------------------------------- + + def compute_follow_sets(self,ntrans,readsets,inclsets): + FP = lambda x: readsets[x] + R = lambda x: inclsets.get(x,[]) + F = digraph(ntrans,R,FP) + return F + + # ----------------------------------------------------------------------------- + # add_lookaheads() + # + # Attaches the lookahead symbols to grammar rules. 
+ # + # Inputs: lookbacks - Set of lookback relations + # followset - Computed follow set + # + # This function directly attaches the lookaheads to productions contained + # in the lookbacks set + # ----------------------------------------------------------------------------- + + def add_lookaheads(self,lookbacks,followset): + for trans,lb in lookbacks.items(): + # Loop over productions in lookback + for state,p in lb: + if not state in p.lookaheads: + p.lookaheads[state] = [] + f = followset.get(trans,[]) + for a in f: + if a not in p.lookaheads[state]: p.lookaheads[state].append(a) + + # ----------------------------------------------------------------------------- + # add_lalr_lookaheads() + # + # This function does all of the work of adding lookahead information for use + # with LALR parsing + # ----------------------------------------------------------------------------- + + def add_lalr_lookaheads(self,C): + # Determine all of the nullable nonterminals + nullable = self.compute_nullable_nonterminals() + + # Find all non-terminal transitions + trans = self.find_nonterminal_transitions(C) + + # Compute read sets + readsets = self.compute_read_sets(C,trans,nullable) + + # Compute lookback/includes relations + lookd, included = self.compute_lookback_includes(C,trans,nullable) + + # Compute LALR FOLLOW sets + followsets = self.compute_follow_sets(trans,readsets,included) + + # Add all of the lookaheads + self.add_lookaheads(lookd,followsets) + + # ----------------------------------------------------------------------------- + # lr_parse_table() + # + # This function constructs the parse tables for SLR or LALR + # ----------------------------------------------------------------------------- + def lr_parse_table(self): + Productions = self.grammar.Productions + Precedence = self.grammar.Precedence + goto = self.lr_goto # Goto array + action = self.lr_action # Action array + log = self.log # Logger for output + + actionp = { } # Action production array (temporary) + + 
log.info("Parsing method: %s", self.lr_method) + + # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items + # This determines the number of states + + C = self.lr0_items() + + if self.lr_method == 'LALR': + self.add_lalr_lookaheads(C) + + # Build the parser table, state by state + st = 0 + for I in C: + # Loop over each production in I + actlist = [ ] # List of actions + st_action = { } + st_actionp = { } + st_goto = { } + log.info("") + log.info("state %d", st) + log.info("") for p in I: - _vf.write(" (%d) %s\n" % (p.number, str(p))) - _vf.write("\n") + log.info(" (%d) %s", p.number, str(p)) + log.info("") - for p in I: - try: - if p.len == p.lr_index + 1: - if p.name == "S'": - # Start symbol. Accept! - st_action["$end"] = 0 - st_actionp["$end"] = p - else: - # We are at the end of a production. Reduce! - if method == 'LALR': - laheads = p.lookaheads[st] + for p in I: + if p.len == p.lr_index + 1: + if p.name == "S'": + # Start symbol. Accept! + st_action["$end"] = 0 + st_actionp["$end"] = p else: - laheads = Follow[p.name] - for a in laheads: - actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = st_action.get(a,None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[st_actionp[a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - st_action[a] = -p.number - st_actionp[a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - st_action[a] = None - else: - # Hmmm. 
Guess we'll keep the shift - if not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - st_action[a] = -p.number - st_actionp[a] = p - # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, st_actionp[a].number, st_actionp[a])) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,st_actionp[a].number, st_actionp[a])) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) + # We are at the end of a production. Reduce! + if self.lr_method == 'LALR': + laheads = p.lookaheads[st] else: - st_action[a] = -p.number - st_actionp[a] = p - else: - i = p.lr_index - a = p.prod[i+1] # Get symbol right after the "." - if Terminals.has_key(a): - g = lr0_goto(I,a) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - # We are in a shift state - actlist.append((a,p,"shift and go to state %d" % j)) - r = st_action.get(a,None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. - # - otherwise we shift - rprec,rlevel = Productions[st_actionp[a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... 
highest precedence to shift - st_action[a] = j - st_actionp[a] = p - if not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - st_action[a] = None + laheads = self.grammar.Follow[p.name] + for a in laheads: + actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) + r = st_action.get(a,None) + if r is not None: + # Whoa. Have a shift/reduce or reduce/reduce conflict + if r > 0: + # Need to decide on shift or reduce here + # By default we favor shifting. Need to add + # some precedence rules here. + sprec,slevel = Productions[st_actionp[a].number].prec + rprec,rlevel = Precedence.get(a,('right',0)) + if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): + # We really need to reduce here. + st_action[a] = -p.number + st_actionp[a] = p + if not slevel and not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as reduce",a) + self.sr_conflicts.append((st,a,'reduce')) + Productions[p.number].reduced += 1 + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the shift + if not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as shift",a) + self.sr_conflicts.append((st,a,'shift')) + elif r < 0: + # Reduce/reduce conflict. In this case, we favor the rule + # that was defined first in the grammar file + oldp = Productions[-r] + pp = Productions[p.number] + if oldp.line > pp.line: + st_action[a] = -p.number + st_actionp[a] = p + chosenp,rejectp = pp,oldp + Productions[p.number].reduced += 1 + Productions[oldp.number].reduced -= 1 + else: + chosenp,rejectp = oldp,pp + self.rr_conflicts.append((st,chosenp,rejectp)) + log.info(" ! reduce/reduce conflict for %s resolved using rule %d (%s)", a,st_actionp[a].number, st_actionp[a]) else: - # Hmmm. 
Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) + raise LALRError("Unknown conflict in state %d" % st) + else: + st_action[a] = -p.number + st_actionp[a] = p + Productions[p.number].reduced += 1 + else: + i = p.lr_index + a = p.prod[i+1] # Get symbol right after the "." + if a in self.grammar.Terminals: + g = self.lr0_goto(I,a) + j = self.lr0_cidhash.get(id(g),-1) + if j >= 0: + # We are in a shift state + actlist.append((a,p,"shift and go to state %d" % j)) + r = st_action.get(a,None) + if r is not None: + # Whoa have a shift/reduce or shift/shift conflict + if r > 0: + if r != j: + raise LALRError("Shift/shift conflict in state %d" % st) + elif r < 0: + # Do a precedence check. + # - if precedence of reduce rule is higher, we reduce. + # - if precedence of reduce is same and left assoc, we reduce. + # - otherwise we shift + rprec,rlevel = Productions[st_actionp[a].number].prec + sprec,slevel = Precedence.get(a,('right',0)) + if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): + # We decide to shift here... highest precedence to shift + Productions[st_actionp[a].number].reduced -= 1 + st_action[a] = j + st_actionp[a] = p + if not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as shift",a) + self.sr_conflicts.append((st,a,'shift')) + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the reduce + if not slevel and not rlevel: + log.info(" ! 
shift/reduce conflict for %s resolved as reduce",a) + self.sr_conflicts.append((st,a,'reduce')) + else: + raise LALRError("Unknown conflict in state %d" % st) else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - st_action[a] = j - st_actionp[a] = p - - except StandardError,e: - print sys.exc_info() - raise YaccError, "Hosed in lr_parse_table" - - # Print the actions associated with each terminal - if yaccdebug: - _actprint = { } - for a,p,m in actlist: - if st_action.has_key(a): - if p is st_actionp[a]: - _vf.write(" %-15s %s\n" % (a,m)) - _actprint[(a,m)] = 1 - _vf.write("\n") - for a,p,m in actlist: - if st_action.has_key(a): - if p is not st_actionp[a]: - if not _actprint.has_key((a,m)): - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) + st_action[a] = j + st_actionp[a] = p + + # Print the actions associated with each terminal + _actprint = { } + for a,p,m in actlist: + if a in st_action: + if p is st_actionp[a]: + log.info(" %-15s %s",a,m) _actprint[(a,m)] = 1 + log.info("") + # Print the actions that were not used. (debugging) + not_used = 0 + for a,p,m in actlist: + if a in st_action: + if p is not st_actionp[a]: + if not (a,m) in _actprint: + log.debug(" ! 
%-15s [ %s ]",a,m) + not_used = 1 + _actprint[(a,m)] = 1 + if not_used: + log.debug("") + + # Construct the goto table for this state + + nkeys = { } + for ii in I: + for s in ii.usyms: + if s in self.grammar.Nonterminals: + nkeys[s] = None + for n in nkeys: + g = self.lr0_goto(I,n) + j = self.lr0_cidhash.get(id(g),-1) + if j >= 0: + st_goto[n] = j + log.info(" %-30s shift and go to state %d",n,j) + + action[st] = st_action + actionp[st] = st_actionp + goto[st] = st_goto + st += 1 + + + # ----------------------------------------------------------------------------- + # write() + # + # This function writes the LR parsing tables to a file + # ----------------------------------------------------------------------------- + + def write_table(self,modulename,outputdir='',signature=""): + basemodulename = modulename.split(".")[-1] + filename = os.path.join(outputdir,basemodulename) + ".py" + try: + f = open(filename,"w") - # Construct the goto table for this state - if yaccdebug: - _vf.write("\n") - nkeys = { } - for ii in I: - for s in ii.usyms: - if Nonterminals.has_key(s): - nkeys[s] = None - for n in nkeys.keys(): - g = lr0_goto(I,n) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - st_goto[n] = j - if yaccdebug: - _vf.write(" %-30s shift and go to state %d\n" % (n,j)) - - action[st] = st_action - actionp[st] = st_actionp - goto[st] = st_goto - - st += 1 - - if yaccdebug: - if n_srconflict == 1: - sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) - if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) - if n_rrconflict == 1: - sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) - if n_rrconflict > 1: - sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - -# ----------------------------------------------------------------------------- -# ==== LR Utility functions ==== -# ----------------------------------------------------------------------------- - -# 
----------------------------------------------------------------------------- -# _lr_write_tables() -# -# This function writes the LR parsing tables to a file -# ----------------------------------------------------------------------------- - -def lr_write_tables(modulename=tab_module,outputdir=''): - filename = os.path.join(outputdir,modulename) + ".py" - try: - f = open(filename,"w") - - f.write(""" + f.write(""" # %s # This file is automatically generated. Do not edit. +_tabversion = %r -_lr_method = %s - -_lr_signature = %s -""" % (filename, repr(_lr_method), repr(Signature.digest()))) - - # Change smaller to 0 to go back to original tables - smaller = 1 +_lr_method = %r - # Factor out names to try and make smaller - if smaller: - items = { } +_lr_signature = %r + """ % (filename, __tabversion__, self.lr_method, signature)) - for s,nd in _lr_action.items(): - for name,v in nd.items(): - i = items.get(name) - if not i: - i = ([],[]) - items[name] = i - i[0].append(s) - i[1].append(v) + # Change smaller to 0 to go back to original tables + smaller = 1 - f.write("\n_lr_action_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" + # Factor out names to try and make smaller + if smaller: + items = { } + + for s,nd in self.lr_action.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write("\n_lr_action_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): - if not _lr_action.has_key(_x): _lr_action[_x] = { } + if not _x in _lr_action: _lr_action[_x] = { } _lr_action[_x][_k] = _y del 
_lr_action_items """) - else: - f.write("\n_lr_action = { "); - for k,v in _lr_action.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - if smaller: - # Factor out names to try and make smaller - items = { } - - for s,nd in _lr_goto.items(): - for name,v in nd.items(): - i = items.get(name) - if not i: - i = ([],[]) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write("\n_lr_goto_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" + else: + f.write("\n_lr_action = { "); + for k,v in self.lr_action.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("}\n"); + + if smaller: + # Factor out names to try and make smaller + items = { } + + for s,nd in self.lr_goto.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write("\n_lr_goto_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): - if not _lr_goto.has_key(_x): _lr_goto[_x] = { } + if not _x in _lr_goto: _lr_goto[_x] = { } _lr_goto[_x][_k] = _y del _lr_goto_items """) - else: - f.write("\n_lr_goto = { "); - for k,v in _lr_goto.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - # Write production table - f.write("_lr_productions = [\n") - for p in Productions: - if p: - if (p.func): - f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) - else: - f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) else: - f.write(" None,\n") - f.write("]\n") + f.write("\n_lr_goto = { "); + for k,v in self.lr_goto.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + 
f.write("}\n"); + + # Write production table + f.write("_lr_productions = [\n") + for p in self.lr_productions: + if p.func: + f.write(" (%r,%r,%d,%r,%r,%d),\n" % (p.str,p.name, p.len, p.func,p.file,p.line)) + else: + f.write(" (%r,%r,%d,None,None,None),\n" % (str(p),p.name, p.len)) + f.write("]\n") + f.close() - f.close() + except IOError: + e = sys.exc_info()[1] + sys.stderr.write("Unable to create '%s'\n" % filename) + sys.stderr.write(str(e)+"\n") + return - except IOError,e: - print >>sys.stderr, "Unable to create '%s'" % filename - print >>sys.stderr, e - return -def lr_read_tables(module=tab_module,optimize=0): - global _lr_action, _lr_goto, _lr_productions, _lr_method - try: - exec "import %s as parsetab" % module - - if (optimize) or (Signature.digest() == parsetab._lr_signature): - _lr_action = parsetab._lr_action - _lr_goto = parsetab._lr_goto - _lr_productions = parsetab._lr_productions - _lr_method = parsetab._lr_method - return 1 - else: - return 0 - - except (ImportError,AttributeError): - return 0 + # ----------------------------------------------------------------------------- + # pickle_table() + # + # This function pickles the LR parsing tables to a supplied file object + # ----------------------------------------------------------------------------- + def pickle_table(self,filename,signature=""): + try: + import cPickle as pickle + except ImportError: + import pickle + outf = open(filename,"wb") + pickle.dump(__tabversion__,outf,pickle_protocol) + pickle.dump(self.lr_method,outf,pickle_protocol) + pickle.dump(signature,outf,pickle_protocol) + pickle.dump(self.lr_action,outf,pickle_protocol) + pickle.dump(self.lr_goto,outf,pickle_protocol) + + outp = [] + for p in self.lr_productions: + if p.func: + outp.append((p.str,p.name, p.len, p.func,p.file,p.line)) + else: + outp.append((str(p),p.name,p.len,None,None,None)) + pickle.dump(outp,outf,pickle_protocol) + outf.close() # 
----------------------------------------------------------------------------- -# yacc(module) +# === INTROSPECTION === # -# Build the parser module +# The following functions and classes are used to implement the PLY +# introspection features followed by the yacc() function itself. # ----------------------------------------------------------------------------- -def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file,outputdir=''): - global yaccdebug - yaccdebug = debug +# ----------------------------------------------------------------------------- +# get_caller_module_dict() +# +# This function returns a dictionary containing all of the symbols defined within +# a caller further down the call stack. This is used to get the environment +# associated with the yacc() call if none was provided. +# ----------------------------------------------------------------------------- - initialize_vars() - files = { } - error = 0 +def get_caller_module_dict(levels): + try: + raise RuntimeError + except RuntimeError: + e,b,t = sys.exc_info() + f = t.tb_frame + while levels > 0: + f = f.f_back + levels -= 1 + ldict = f.f_globals.copy() + if f.f_globals != f.f_locals: + ldict.update(f.f_locals) + + return ldict +# ----------------------------------------------------------------------------- +# parse_grammar() +# +# This takes a raw grammar rule string and parses it into production data +# ----------------------------------------------------------------------------- +def parse_grammar(doc,file,line): + grammar = [] + # Split the doc string into lines + pstrings = doc.splitlines() + lastp = None + dline = line + for ps in pstrings: + dline += 1 + p = ps.split() + if not p: continue + try: + if p[0] == '|': + # This is a continuation of a previous rule + if not lastp: + raise SyntaxError("%s:%d: Misplaced '|'" % (file,dline)) + prodname = lastp + syms = p[1:] + else: + prodname = p[0] 
+ lastp = prodname + syms = p[2:] + assign = p[1] + if assign != ':' and assign != '::=': + raise SyntaxError("%s:%d: Syntax error. Expected ':'" % (file,dline)) - # Add parsing method to signature - Signature.update(method) + grammar.append((file,dline,prodname,syms)) + except SyntaxError: + raise + except Exception: + raise SyntaxError("%s:%d: Syntax error in rule '%s'" % (file,dline,ps.strip())) - # If a "module" parameter was supplied, extract its dictionary. - # Note: a module may in fact be an instance as well. + return grammar - if module: - # User supplied a module object. - if isinstance(module, types.ModuleType): - ldict = module.__dict__ - elif isinstance(module, _INSTANCETYPE): - _items = [(k,getattr(module,k)) for k in dir(module)] - ldict = { } - for i in _items: - ldict[i[0]] = i[1] +# ----------------------------------------------------------------------------- +# ParserReflect() +# +# This class represents information extracted for building a parser including +# start symbol, error function, tokens, precedence list, action functions, +# etc. +# ----------------------------------------------------------------------------- +class ParserReflect(object): + def __init__(self,pdict,log=None): + self.pdict = pdict + self.start = None + self.error_func = None + self.tokens = None + self.files = {} + self.grammar = [] + self.error = 0 + + if log is None: + self.log = PlyLogger(sys.stderr) else: - raise ValueError,"Expected a module" - - else: - # No module given. We might be able to get information from the caller. 
- # Throw an exception and unwind the traceback to get the globals - + self.log = log + + # Get all of the basic information + def get_all(self): + self.get_start() + self.get_error_func() + self.get_tokens() + self.get_precedence() + self.get_pfunctions() + + # Validate all of the information + def validate_all(self): + self.validate_start() + self.validate_error_func() + self.validate_tokens() + self.validate_precedence() + self.validate_pfunctions() + self.validate_files() + return self.error + + # Compute a signature over the grammar + def signature(self): try: - raise RuntimeError - except RuntimeError: - e,b,t = sys.exc_info() - f = t.tb_frame - f = f.f_back # Walk out to our calling function - ldict = f.f_globals # Grab its globals dictionary - - # Add starting symbol to signature - if not start: - start = ldict.get("start",None) - if start: - Signature.update(start) - - # If running in optimized mode. We're going to - - if (optimize and lr_read_tables(tabmodule,1)): - # Read parse table - del Productions[:] - for p in _lr_productions: - if not p: - Productions.append(None) - else: - m = MiniProduction() - m.name = p[0] - m.len = p[1] - m.file = p[3] - m.line = p[4] - if p[2]: - m.func = ldict[p[2]] - Productions.append(m) - - else: - # Get the tokens map - if (module and isinstance(module,_INSTANCETYPE)): - tokens = getattr(module,"tokens",None) - else: - tokens = ldict.get("tokens",None) - - if not tokens: - raise YaccError,"module does not define a list 'tokens'" - if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): - raise YaccError,"tokens must be a list or tuple." 
+ from hashlib import md5 + except ImportError: + from md5 import md5 + try: + sig = md5() + if self.start: + sig.update(self.start.encode('latin-1')) + if self.prec: + sig.update("".join(["".join(p) for p in self.prec]).encode('latin-1')) + if self.tokens: + sig.update(" ".join(self.tokens).encode('latin-1')) + for f in self.pfuncs: + if f[3]: + sig.update(f[3].encode('latin-1')) + except (TypeError,ValueError): + pass + return sig.digest() + + # ----------------------------------------------------------------------------- + # validate_file() + # + # This method checks to see if there are duplicated p_rulename() functions + # in the parser module file. Without this function, it is really easy for + # users to make mistakes by cutting and pasting code fragments (and it's a real + # bugger to try and figure out why the resulting parser doesn't work). Therefore, + # we just do a little regular expression pattern matching of def statements + # to try and detect duplicates. + # ----------------------------------------------------------------------------- + + def validate_files(self): + # Match def p_funcname( + fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') + + for filename in self.files.keys(): + base,ext = os.path.splitext(filename) + if ext != '.py': return 1 # No idea. Assume it's okay. - # Check to see if a requires dictionary is defined. - requires = ldict.get("require",None) - if requires: - if not (isinstance(requires,types.DictType)): - raise YaccError,"require must be a dictionary." + try: + f = open(filename) + lines = f.readlines() + f.close() + except IOError: + continue - for r,v in requires.items(): - try: - if not (isinstance(v,types.ListType)): - raise TypeError - v1 = [x.split(".") for x in v] - Requires[r] = v1 - except StandardError: - print >>sys.stderr, "Invalid specification for rule '%s' in require. Expected a list of strings" % r - - - # Build the dictionary of terminals. 
We a record a 0 in the - # dictionary to track whether or not a terminal is actually - # used in the grammar - - if 'error' in tokens: - print >>sys.stderr, "yacc: Illegal token 'error'. Is a reserved word." - raise YaccError,"Illegal token name" - - for n in tokens: - if Terminals.has_key(n): - print >>sys.stderr, "yacc: Warning. Token '%s' multiply defined." % n - Terminals[n] = [ ] - - Terminals['error'] = [ ] - - # Get the precedence map (if any) - prec = ldict.get("precedence",None) - if prec: - if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)): - raise YaccError,"precedence must be a list or tuple." - add_precedence(prec) - Signature.update(repr(prec)) - - for n in tokens: - if not Precedence.has_key(n): - Precedence[n] = ('right',0) # Default, right associative, 0 precedence - - # Look for error handler - ef = ldict.get('p_error',None) - if ef: - if isinstance(ef,types.FunctionType): + counthash = { } + for linen,l in enumerate(lines): + linen += 1 + m = fre.match(l) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + self.log.warning("%s:%d: Function %s redefined. Previously defined on line %d", filename,linen,name,prev) + + # Get the start symbol + def get_start(self): + self.start = self.pdict.get('start') + + # Validate the start symbol + def validate_start(self): + if self.start is not None: + if not isinstance(self.start,str): + self.log.error("'start' must be a string") + + # Look for error handler + def get_error_func(self): + self.error_func = self.pdict.get('p_error') + + # Validate the error function + def validate_error_func(self): + if self.error_func: + if isinstance(self.error_func,types.FunctionType): ismethod = 0 - elif isinstance(ef, types.MethodType): + elif isinstance(self.error_func, types.MethodType): ismethod = 1 else: - raise YaccError,"'p_error' defined, but is not a function or method." 
- eline = ef.func_code.co_firstlineno - efile = ef.func_code.co_filename - files[efile] = None - - if (ef.func_code.co_argcount != 1+ismethod): - raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) - global Errorfunc - Errorfunc = ef - else: - print >>sys.stderr, "yacc: Warning. no p_error() function is defined." + self.log.error("'p_error' defined, but is not a function or method") + self.error = 1 + return - # Get the list of built-in functions with p_ prefix - symbols = [ldict[f] for f in ldict.keys() - if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_' - and ldict[f].__name__ != 'p_error')] + eline = func_code(self.error_func).co_firstlineno + efile = func_code(self.error_func).co_filename + self.files[efile] = 1 - # Check for non-empty symbols - if len(symbols) == 0: - raise YaccError,"no rules of the form p_rulename are defined." + if (func_code(self.error_func).co_argcount != 1+ismethod): + self.log.error("%s:%d: p_error() requires 1 argument",efile,eline) + self.error = 1 - # Sort the symbols by line number - symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) + # Get the tokens map + def get_tokens(self): + tokens = self.pdict.get("tokens",None) + if not tokens: + self.log.error("No token list is defined") + self.error = 1 + return + + if not isinstance(tokens,(list, tuple)): + self.log.error("tokens must be a list or tuple") + self.error = 1 + return + + if not tokens: + self.log.error("tokens is empty") + self.error = 1 + return + + self.tokens = tokens + + # Validate the tokens + def validate_tokens(self): + # Validate the tokens. + if 'error' in self.tokens: + self.log.error("Illegal token name 'error'. 
Is a reserved word") + self.error = 1 + return + + terminals = {} + for n in self.tokens: + if n in terminals: + self.log.warning("Token '%s' multiply defined", n) + terminals[n] = 1 + + # Get the precedence map (if any) + def get_precedence(self): + self.prec = self.pdict.get("precedence",None) + + # Validate and parse the precedence map + def validate_precedence(self): + preclist = [] + if self.prec: + if not isinstance(self.prec,(list,tuple)): + self.log.error("precedence must be a list or tuple") + self.error = 1 + return + for level,p in enumerate(self.prec): + if not isinstance(p,(list,tuple)): + self.log.error("Bad precedence table") + self.error = 1 + return - # Add all of the symbols to the grammar - for f in symbols: - if (add_function(f)) < 0: - error += 1 + if len(p) < 2: + self.log.error("Malformed precedence entry %s. Must be (assoc, term, ..., term)",p) + self.error = 1 + return + assoc = p[0] + if not isinstance(assoc,str): + self.log.error("precedence associativity must be a string") + self.error = 1 + return + for term in p[1:]: + if not isinstance(term,str): + self.log.error("precedence items must be strings") + self.error = 1 + return + preclist.append((term,assoc,level+1)) + self.preclist = preclist + + # Get all p_functions from the grammar + def get_pfunctions(self): + p_functions = [] + for name, item in self.pdict.items(): + if name[:2] != 'p_': continue + if name == 'p_error': continue + if isinstance(item,(types.FunctionType,types.MethodType)): + line = func_code(item).co_firstlineno + file = func_code(item).co_filename + p_functions.append((line,file,name,item.__doc__)) + + # Sort all of the actions by line number + p_functions.sort() + self.pfuncs = p_functions + + + # Validate all of the p_functions + def validate_pfunctions(self): + grammar = [] + # Check for non-empty symbols + if len(self.pfuncs) == 0: + self.log.error("no rules of the form p_rulename are defined") + self.error = 1 + return + + for line, file, name, doc in 
self.pfuncs: + func = self.pdict[name] + if isinstance(func, types.MethodType): + reqargs = 2 else: - files[f.func_code.co_filename] = None - - # Make a signature of the docstrings - for f in symbols: - if f.__doc__: - Signature.update(f.__doc__) - - lr_init_vars() - - if error: - raise YaccError,"Unable to construct parser." - - if not lr_read_tables(tabmodule): - - # Validate files - for filename in files.keys(): - if not validate_file(filename): - error = 1 - - # Validate dictionary - validate_dict(ldict) - - if start and not Prodnames.has_key(start): - raise YaccError,"Bad starting symbol '%s'" % start + reqargs = 1 + if func_code(func).co_argcount > reqargs: + self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,func.__name__) + self.error = 1 + elif func_code(func).co_argcount < reqargs: + self.log.error("%s:%d: Rule '%s' requires an argument",file,line,func.__name__) + self.error = 1 + elif not func.__doc__: + self.log.warning("%s:%d: No documentation string specified in function '%s' (ignored)",file,line,func.__name__) + else: + try: + parsed_g = parse_grammar(doc,file,line) + for g in parsed_g: + grammar.append((name, g)) + except SyntaxError: + e = sys.exc_info()[1] + self.log.error(str(e)) + self.error = 1 + + # Looks like a valid grammar rule + # Mark the file in which defined. + self.files[file] = 1 + + # Secondary validation step that looks for p_ definitions that are not functions + # or functions that look like they might be grammar rules. 
+ + for n,v in self.pdict.items(): + if n[0:2] == 'p_' and isinstance(v, (types.FunctionType, types.MethodType)): continue + if n[0:2] == 't_': continue + if n[0:2] == 'p_' and n != 'p_error': + self.log.warning("'%s' not defined as a function", n) + if ((isinstance(v,types.FunctionType) and func_code(v).co_argcount == 1) or + (isinstance(v,types.MethodType) and func_code(v).co_argcount == 2)): + try: + doc = v.__doc__.split(" ") + if doc[1] == ':': + self.log.warning("%s:%d: Possible grammar rule '%s' defined without p_ prefix", + func_code(v).co_filename, func_code(v).co_firstlineno,n) + except Exception: + pass - augment_grammar(start) - error = verify_productions(cycle_check=check_recursion) - otherfunc = [ldict[f] for f in ldict.keys() - if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')] + self.grammar = grammar - if error: - raise YaccError,"Unable to construct parser." +# ----------------------------------------------------------------------------- +# yacc(module) +# +# Build a parser +# ----------------------------------------------------------------------------- - build_lritems() - compute_first1() - compute_follow(start) +def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None, + check_recursion=1, optimize=0, write_tables=1, debugfile=debug_file,outputdir='', + debuglog=None, errorlog = None, picklefile=None): - if method in ['SLR','LALR']: - lr_parse_table(method) - else: - raise YaccError, "Unknown parsing method '%s'" % method + global parse # Reference to the parsing method of the last built parser - if write_tables: - lr_write_tables(tabmodule,outputdir) + # If pickling is enabled, table files are not created - if yaccdebug: - try: - f = open(os.path.join(outputdir,debugfile),"w") - f.write(_vfc.getvalue()) - f.write("\n\n") - f.write(_vf.getvalue()) - f.close() - except IOError,e: - print >>sys.stderr, "yacc: can't create '%s'" % debugfile,e + if picklefile: + write_tables = 0 - # 
Made it here. Create a parser object and set up its internal state. - # Set global parse() method to bound method of parser object. + if errorlog is None: + errorlog = PlyLogger(sys.stderr) - p = Parser("xyzzy") - p.productions = Productions - p.errorfunc = Errorfunc - p.action = _lr_action - p.goto = _lr_goto - p.method = _lr_method - p.require = Requires + # Get the module dictionary used for the parser + if module: + _items = [(k,getattr(module,k)) for k in dir(module)] + pdict = dict(_items) + else: + pdict = get_caller_module_dict(2) - global parse - parse = p.parse + # Collect parser information from the dictionary + pinfo = ParserReflect(pdict,log=errorlog) + pinfo.get_all() - global parser - parser = p + if pinfo.error: + raise YaccError("Unable to build parser") - # Clean up all of the globals we created - if (not optimize): - yacc_cleanup() - return p + # Check signature against table files (if any) + signature = pinfo.signature() -# yacc_cleanup function. Delete all of the global variables -# used during table construction + # Read the tables + try: + lr = LRTable() + if picklefile: + read_signature = lr.read_pickle(picklefile) + else: + read_signature = lr.read_table(tabmodule) + if optimize or (read_signature == signature): + try: + lr.bind_callables(pinfo.pdict) + parser = LRParser(lr,pinfo.error_func) + parse = parser.parse + return parser + except Exception: + e = sys.exc_info()[1] + errorlog.warning("There was a problem loading the table file: %s", repr(e)) + except VersionError: + e = sys.exc_info() + errorlog.warning(str(e)) + except Exception: + pass + + if debuglog is None: + if debug: + debuglog = PlyLogger(open(debugfile,"w")) + else: + debuglog = NullLogger() -def yacc_cleanup(): - global _lr_action, _lr_goto, _lr_method, _lr_goto_cache - del _lr_action, _lr_goto, _lr_method, _lr_goto_cache + debuglog.info("Created by PLY version %s (http://www.dabeaz.com/ply)", __version__) - global Productions, Prodnames, Prodmap, Terminals - global 
Nonterminals, First, Follow, Precedence, LRitems - global Errorfunc, Signature, Requires - del Productions, Prodnames, Prodmap, Terminals - del Nonterminals, First, Follow, Precedence, LRitems - del Errorfunc, Signature, Requires + errors = 0 - global _vf, _vfc - del _vf, _vfc + # Validate the parser information + if pinfo.validate_all(): + raise YaccError("Unable to build parser") + + if not pinfo.error_func: + errorlog.warning("no p_error() function is defined") + # Create a grammar object + grammar = Grammar(pinfo.tokens) -# Stub that raises an error if parsing is attempted without first calling yacc() -def parse(*args,**kwargs): - raise YaccError, "yacc: No parser built with yacc()" + # Set precedence level for terminals + for term, assoc, level in pinfo.preclist: + try: + grammar.set_precedence(term,assoc,level) + except GrammarError: + e = sys.exc_info()[1] + errorlog.warning("%s",str(e)) + + # Add productions to the grammar + for funcname, gram in pinfo.grammar: + file, line, prodname, syms = gram + try: + grammar.add_production(prodname,syms,funcname,file,line) + except GrammarError: + e = sys.exc_info()[1] + errorlog.error("%s",str(e)) + errors = 1 + # Set the grammar start symbols + try: + if start is None: + grammar.set_start(pinfo.start) + else: + grammar.set_start(start) + except GrammarError: + e = sys.exc_info()[1] + errorlog.error(str(e)) + errors = 1 + + if errors: + raise YaccError("Unable to build parser") + + # Verify the grammar structure + undefined_symbols = grammar.undefined_symbols() + for sym, prod in undefined_symbols: + errorlog.error("%s:%d: Symbol '%s' used, but not defined as a token or a rule",prod.file,prod.line,sym) + errors = 1 + + unused_terminals = grammar.unused_terminals() + if unused_terminals: + debuglog.info("") + debuglog.info("Unused terminals:") + debuglog.info("") + for term in unused_terminals: + errorlog.warning("Token '%s' defined, but not used", term) + debuglog.info(" %s", term) + + # Print out all productions to 
the debug log + if debug: + debuglog.info("") + debuglog.info("Grammar") + debuglog.info("") + for n,p in enumerate(grammar.Productions): + debuglog.info("Rule %-5d %s", n, p) + + # Find unused non-terminals + unused_rules = grammar.unused_rules() + for prod in unused_rules: + errorlog.warning("%s:%d: Rule '%s' defined, but not used", prod.file, prod.line, prod.name) + + if len(unused_terminals) == 1: + errorlog.warning("There is 1 unused token") + if len(unused_terminals) > 1: + errorlog.warning("There are %d unused tokens", len(unused_terminals)) + + if len(unused_rules) == 1: + errorlog.warning("There is 1 unused rule") + if len(unused_rules) > 1: + errorlog.warning("There are %d unused rules", len(unused_rules)) + + if debug: + debuglog.info("") + debuglog.info("Terminals, with rules where they appear") + debuglog.info("") + terms = list(grammar.Terminals) + terms.sort() + for term in terms: + debuglog.info("%-20s : %s", term, " ".join([str(s) for s in grammar.Terminals[term]])) + + debuglog.info("") + debuglog.info("Nonterminals, with rules where they appear") + debuglog.info("") + nonterms = list(grammar.Nonterminals) + nonterms.sort() + for nonterm in nonterms: + debuglog.info("%-20s : %s", nonterm, " ".join([str(s) for s in grammar.Nonterminals[nonterm]])) + debuglog.info("") + + if check_recursion: + unreachable = grammar.find_unreachable() + for u in unreachable: + errorlog.warning("Symbol '%s' is unreachable",u) + + infinite = grammar.infinite_cycles() + for inf in infinite: + errorlog.error("Infinite recursion detected for symbol '%s'", inf) + errors = 1 + + unused_prec = grammar.unused_precedence() + for term, assoc in unused_prec: + errorlog.error("Precedence rule '%s' defined for unknown symbol '%s'", assoc, term) + errors = 1 + + if errors: + raise YaccError("Unable to build parser") + + # Run the LRGeneratedTable on the grammar + if debug: + errorlog.debug("Generating %s tables", method) + + lr = LRGeneratedTable(grammar,method,debuglog) + + if 
debug: + num_sr = len(lr.sr_conflicts) + + # Report shift/reduce and reduce/reduce conflicts + if num_sr == 1: + errorlog.warning("1 shift/reduce conflict") + elif num_sr > 1: + errorlog.warning("%d shift/reduce conflicts", num_sr) + + num_rr = len(lr.rr_conflicts) + if num_rr == 1: + errorlog.warning("1 reduce/reduce conflict") + elif num_rr > 1: + errorlog.warning("%d reduce/reduce conflicts", num_rr) + + # Write out conflicts to the output file + if debug and (lr.sr_conflicts or lr.rr_conflicts): + debuglog.warning("") + debuglog.warning("Conflicts:") + debuglog.warning("") + + for state, tok, resolution in lr.sr_conflicts: + debuglog.warning("shift/reduce conflict for %s in state %d resolved as %s", tok, state, resolution) + + already_reported = {} + for state, rule, rejected in lr.rr_conflicts: + if (state,id(rule),id(rejected)) in already_reported: + continue + debuglog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule) + debuglog.warning("rejected rule (%s) in state %d", rejected,state) + errorlog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule) + errorlog.warning("rejected rule (%s) in state %d", rejected, state) + already_reported[state,id(rule),id(rejected)] = 1 + + warned_never = [] + for state, rule, rejected in lr.rr_conflicts: + if not rejected.reduced and (rejected not in warned_never): + debuglog.warning("Rule (%s) is never reduced", rejected) + errorlog.warning("Rule (%s) is never reduced", rejected) + warned_never.append(rejected) + + # Write the table file if requested + if write_tables: + lr.write_table(tabmodule,outputdir,signature) + + # Write a pickled version of the tables + if picklefile: + lr.pickle_table(picklefile,signature) + + # Build the parser + lr.bind_callables(pinfo.pdict) + parser = LRParser(lr,pinfo.error_func) + + parse = parser.parse + return parser diff --git a/ext/ply/setup.py b/ext/ply/setup.py index f743ac78c..606b29cde 100644 --- a/ext/ply/setup.py +++ 
b/ext/ply/setup.py @@ -1,23 +1,22 @@ -from distutils.core import setup +try: + from setuptools import setup +except ImportError: + from distutils.core import setup setup(name = "ply", description="Python Lex & Yacc", long_description = """ -PLY is yet another implementation of lex and yacc for Python. Although several other -parsing tools are available for Python, there are several reasons why you might -want to take a look at PLY: +PLY is yet another implementation of lex and yacc for Python. Some notable +features include the fact that its implemented entirely in Python and it +uses LALR(1) parsing which is efficient and well suited for larger grammars. -It's implemented entirely in Python. +PLY provides most of the standard lex/yacc features including support for empty +productions, precedence rules, error recovery, and support for ambiguous grammars. -It uses LR-parsing which is reasonably efficient and well suited for larger grammars. - -PLY provides most of the standard lex/yacc features including support for empty -productions, precedence rules, error recovery, and support for ambiguous grammars. - -PLY is extremely easy to use and provides very extensive error checking. +PLY is extremely easy to use and provides very extensive error checking. """, - license="""Lesser GPL (LGPL)""", - version = "2.3", + license="""BSD""", + version = "3.2", author = "David Beazley", author_email = "dave@dabeaz.com", maintainer = "David Beazley", diff --git a/ext/ply/test/calclex.py b/ext/ply/test/calclex.py index d3e873266..67d245f19 100644 --- a/ext/ply/test/calclex.py +++ b/ext/ply/test/calclex.py @@ -3,7 +3,7 @@ # ----------------------------------------------------------------------------- import sys -sys.path.append("..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex tokens = ( @@ -28,7 +28,7 @@ def t_NUMBER(t): try: t.value = int(t.value) except ValueError: - print "Integer value too large", t.value + print("Integer value too large %s" % t.value) t.value = 0 return t @@ -37,11 +37,11 @@ t_ignore = " \t" def t_newline(t): r'\n+' t.lineno += t.value.count("\n") - + def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - + # Build the lexer lex.lex() diff --git a/ext/ply/test/cleanup.sh b/ext/ply/test/cleanup.sh index d7d99b65f..9db936837 100644..100755 --- a/ext/ply/test/cleanup.sh +++ b/ext/ply/test/cleanup.sh @@ -1,4 +1,4 @@ #!/bin/sh -rm -f *~ *.pyc *.dif *.out +rm -f *~ *.pyc *.pyo *.dif *.out diff --git a/ext/ply/test/lex_closure.py b/ext/ply/test/lex_closure.py new file mode 100644 index 000000000..30ee67912 --- /dev/null +++ b/ext/ply/test/lex_closure.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# lex_closure.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +def make_calc(): + + # Tokens + + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_EQUALS = r'=' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + + t_ignore = " \t" + + def t_newline(t): + r'\n+' + t.lineno += t.value.count("\n") + + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + return lex.lex() + +make_calc() +lex.runmain(data="3+4") + + + diff --git a/ext/ply/test/lex_doc1.exp b/ext/ply/test/lex_doc1.exp deleted file mode 100644 index 5b63c1e91..000000000 --- a/ext/ply/test/lex_doc1.exp +++ /dev/null @@ -1 +0,0 @@ -./lex_doc1.py:18: No regular expression defined for rule 't_NUMBER' diff --git a/ext/ply/test/lex_doc1.py b/ext/ply/test/lex_doc1.py index 3951b5c5d..8a2bfcce8 100644 --- a/ext/ply/test/lex_doc1.py +++ b/ext/ply/test/lex_doc1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_doc1.py # # Missing documentation string import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -21,10 +21,6 @@ def t_NUMBER(t): def t_error(t): pass - -import sys -sys.tracebacklimit = 0 - lex.lex() diff --git a/ext/ply/test/lex_dup1.exp b/ext/ply/test/lex_dup1.exp deleted file mode 100644 index 2098a40e5..000000000 --- a/ext/ply/test/lex_dup1.exp +++ /dev/null @@ -1,2 +0,0 @@ -./lex_dup1.py:20: Rule t_NUMBER redefined. Previously defined on line 18 -SyntaxError: lex: Unable to build lexer. 
diff --git a/ext/ply/test/lex_dup1.py b/ext/ply/test/lex_dup1.py index 68f80925b..fd04cdb79 100644 --- a/ext/ply/test/lex_dup1.py +++ b/ext/ply/test/lex_dup1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_dup1.py # # Duplicated rule specifiers import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -22,7 +22,7 @@ t_NUMBER = r'\d+' def t_error(t): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_dup2.exp b/ext/ply/test/lex_dup2.exp deleted file mode 100644 index d327cfe47..000000000 --- a/ext/ply/test/lex_dup2.exp +++ /dev/null @@ -1,2 +0,0 @@ -./lex_dup2.py:22: Rule t_NUMBER redefined. Previously defined on line 18 -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_dup2.py b/ext/ply/test/lex_dup2.py index f4d346e75..870e5e7d1 100644 --- a/ext/ply/test/lex_dup2.py +++ b/ext/ply/test/lex_dup2.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_dup2.py # # Duplicated rule specifiers import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -26,7 +26,7 @@ def t_NUMBER(t): def t_error(t): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_dup3.exp b/ext/ply/test/lex_dup3.exp deleted file mode 100644 index ec0680c6c..000000000 --- a/ext/ply/test/lex_dup3.exp +++ /dev/null @@ -1,2 +0,0 @@ -./lex_dup3.py:20: Rule t_NUMBER redefined. Previously defined on line 18 -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_dup3.py b/ext/ply/test/lex_dup3.py index e17b52059..94b5592eb 100644 --- a/ext/ply/test/lex_dup3.py +++ b/ext/ply/test/lex_dup3.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_dup3.py # # Duplicated rule specifiers import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -24,7 +24,7 @@ def t_NUMBER(t): def t_error(t): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_empty.exp b/ext/ply/test/lex_empty.exp deleted file mode 100644 index af38602d5..000000000 --- a/ext/ply/test/lex_empty.exp +++ /dev/null @@ -1 +0,0 @@ -SyntaxError: lex: no rules of the form t_rulename are defined. diff --git a/ext/ply/test/lex_empty.py b/ext/ply/test/lex_empty.py index 96625f732..e0368bfad 100644 --- a/ext/ply/test/lex_empty.py +++ b/ext/ply/test/lex_empty.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_empty.py # # No rules defined import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -13,7 +13,7 @@ tokens = [ "NUMBER", ] -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_error1.exp b/ext/ply/test/lex_error1.exp deleted file mode 100644 index baa19e5b3..000000000 --- a/ext/ply/test/lex_error1.exp +++ /dev/null @@ -1 +0,0 @@ -lex: Warning. no t_error rule is defined. diff --git a/ext/ply/test/lex_error1.py b/ext/ply/test/lex_error1.py index a99d9bedf..4508a8084 100644 --- a/ext/ply/test/lex_error1.py +++ b/ext/ply/test/lex_error1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_error1.py # # Missing t_error() rule import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -17,7 +17,7 @@ t_PLUS = r'\+' t_MINUS = r'-' t_NUMBER = r'\d+' -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_error2.exp b/ext/ply/test/lex_error2.exp deleted file mode 100644 index fb1b55c8b..000000000 --- a/ext/ply/test/lex_error2.exp +++ /dev/null @@ -1 +0,0 @@ -SyntaxError: lex: Rule 't_error' must be defined as a function diff --git a/ext/ply/test/lex_error2.py b/ext/ply/test/lex_error2.py index a59c8d454..8040d3902 100644 --- a/ext/ply/test/lex_error2.py +++ b/ext/ply/test/lex_error2.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_error2.py # # t_error defined, but not function import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -19,7 +19,7 @@ t_NUMBER = r'\d+' t_error = "foo" -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_error3.exp b/ext/ply/test/lex_error3.exp deleted file mode 100644 index 1b482bf62..000000000 --- a/ext/ply/test/lex_error3.exp +++ /dev/null @@ -1,2 +0,0 @@ -./lex_error3.py:20: Rule 't_error' requires an argument. -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_error3.py b/ext/ply/test/lex_error3.py index 584600f3b..1feefb649 100644 --- a/ext/ply/test/lex_error3.py +++ b/ext/ply/test/lex_error3.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_error3.py # # t_error defined as function, but with wrong # args import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -20,7 +20,7 @@ t_NUMBER = r'\d+' def t_error(): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_error4.exp b/ext/ply/test/lex_error4.exp deleted file mode 100644 index 98505a232..000000000 --- a/ext/ply/test/lex_error4.exp +++ /dev/null @@ -1,2 +0,0 @@ -./lex_error4.py:20: Rule 't_error' has too many arguments. -SyntaxError: lex: Unable to build lexer. 
diff --git a/ext/ply/test/lex_error4.py b/ext/ply/test/lex_error4.py index d05de7490..f4f48db13 100644 --- a/ext/ply/test/lex_error4.py +++ b/ext/ply/test/lex_error4.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_error4.py # # t_error defined as function, but too many args import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -20,7 +20,7 @@ t_NUMBER = r'\d+' def t_error(t,s): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_hedit.exp b/ext/ply/test/lex_hedit.exp deleted file mode 100644 index 7b27dcb57..000000000 --- a/ext/ply/test/lex_hedit.exp +++ /dev/null @@ -1,3 +0,0 @@ -(H_EDIT_DESCRIPTOR,'abc',1,0) -(H_EDIT_DESCRIPTOR,'abcdefghij',1,6) -(H_EDIT_DESCRIPTOR,'xy',1,20) diff --git a/ext/ply/test/lex_hedit.py b/ext/ply/test/lex_hedit.py index 9949549c4..34f15a173 100644 --- a/ext/ply/test/lex_hedit.py +++ b/ext/ply/test/lex_hedit.py @@ -14,7 +14,7 @@ # such tokens # ----------------------------------------------------------------------------- import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -29,16 +29,16 @@ def t_H_EDIT_DESCRIPTOR(t): r"\d+H.*" # This grabs all of the remaining text i = t.value.index('H') n = eval(t.value[:i]) - + # Adjust the tokenizing position t.lexer.lexpos -= len(t.value) - (i+1+n) t.value = t.value[i+1:i+1+n] - return t - + return t + def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) - + # Build the lexer lex.lex() lex.runmain(data="3Habc 10Habcdefghij 2Hxy") diff --git a/ext/ply/test/lex_ignore.exp b/ext/ply/test/lex_ignore.exp deleted file mode 100644 index 6b6b67cdc..000000000 --- a/ext/ply/test/lex_ignore.exp +++ /dev/null @@ -1,7 +0,0 @@ -./lex_ignore.py:20: Rule 't_ignore' must be defined as a string. 
-Traceback (most recent call last): - File "./lex_ignore.py", line 29, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_ignore.py b/ext/ply/test/lex_ignore.py index 94b026693..6c43b4cff 100644 --- a/ext/ply/test/lex_ignore.py +++ b/ext/ply/test/lex_ignore.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_ignore.py # # Improperly specific ignore declaration import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex diff --git a/ext/ply/test/lex_ignore2.exp b/ext/ply/test/lex_ignore2.exp deleted file mode 100644 index 0eb6bf266..000000000 --- a/ext/ply/test/lex_ignore2.exp +++ /dev/null @@ -1 +0,0 @@ -lex: Warning. t_ignore contains a literal backslash '\' diff --git a/ext/ply/test/lex_ignore2.py b/ext/ply/test/lex_ignore2.py index fc95bd1e5..f60987a6b 100644 --- a/ext/ply/test/lex_ignore2.py +++ b/ext/ply/test/lex_ignore2.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_ignore2.py # # ignore declaration as a raw string import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -22,7 +22,7 @@ t_ignore = r' \t' def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_literal1.py b/ext/ply/test/lex_literal1.py new file mode 100644 index 000000000..db389c37c --- /dev/null +++ b/ext/ply/test/lex_literal1.py @@ -0,0 +1,25 @@ +# lex_literal1.py +# +# Bad literal specification + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "NUMBER", + ] + +literals = ["+","-","**"] + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/ext/ply/test/lex_literal2.py b/ext/ply/test/lex_literal2.py new file mode 100644 index 000000000..b50b92cd6 --- /dev/null +++ b/ext/ply/test/lex_literal2.py @@ -0,0 +1,25 @@ +# lex_literal2.py +# +# Bad literal specification + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "NUMBER", + ] + +literals = 23 + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/ext/ply/test/lex_many_tokens.py b/ext/ply/test/lex_many_tokens.py new file mode 100644 index 000000000..77ae12baf --- /dev/null +++ b/ext/ply/test/lex_many_tokens.py @@ -0,0 +1,27 @@ +# lex_many_tokens.py +# +# Test lex's ability to handle a large number of tokens (beyond the +# 100-group limit of the re module) + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = ["TOK%d" % i for i in range(1000)] + +for tok in tokens: + if sys.version_info[0] < 3: + exec("t_%s = '%s:'" % (tok,tok)) + else: + exec("t_%s = '%s:'" % (tok,tok), globals()) + +t_ignore = " \t" + +def t_error(t): + pass + +lex.lex(optimize=1,lextab="manytab") +lex.runmain(data="TOK34: TOK143: TOK269: TOK372: TOK452: TOK561: TOK999:") + + diff --git a/ext/ply/test/lex_module.py b/ext/ply/test/lex_module.py new file mode 100644 index 000000000..8bdd3ed47 --- /dev/null +++ b/ext/ply/test/lex_module.py @@ -0,0 +1,10 @@ +# lex_module.py +# + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex +import lex_module_import +lex.lex(module=lex_module_import) +lex.runmain(data="3+4") diff --git a/ext/ply/test/lex_module_import.py b/ext/ply/test/lex_module_import.py new file mode 100644 index 000000000..df4208236 --- /dev/null +++ b/ext/ply/test/lex_module_import.py @@ -0,0 +1,42 @@ +# ----------------------------------------------------------------------------- +# lex_module_import.py +# +# A lexer defined in a module, but built in lex_module.py +# ----------------------------------------------------------------------------- + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + diff --git a/ext/ply/test/lex_nowarn.py b/ext/ply/test/lex_nowarn.py deleted file mode 100644 index d60d31c53..000000000 --- a/ext/ply/test/lex_nowarn.py +++ /dev/null @@ -1,30 +0,0 @@ -# lex_token.py -# -# Missing t_error() rule - -import sys -sys.path.insert(0,"..") - -import ply.lex as lex - -tokens = [ - "PLUS", - "MINUS", - "NUMBER", - "NUMBER", - ] - -states = (('foo','exclusive'),) - -t_ignore = ' \t' -t_PLUS = r'\+' -t_MINUS = r'-' -t_NUMBER = r'\d+' - -t_foo_NUMBER = r'\d+' - -sys.tracebacklimit = 0 - -lex.lex(nowarn=1) - - diff --git a/ext/ply/test/lex_object.py b/ext/ply/test/lex_object.py new file mode 100644 index 000000000..7e9f389dd --- /dev/null +++ b/ext/ply/test/lex_object.py @@ -0,0 +1,55 @@ +# ----------------------------------------------------------------------------- 
+# lex_object.py +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +class CalcLexer: + tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + + # Tokens + + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_EQUALS = r'=' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + + def t_NUMBER(self,t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + + t_ignore = " \t" + + def t_newline(self,t): + r'\n+' + t.lineno += t.value.count("\n") + + def t_error(self,t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + +calc = CalcLexer() + +# Build the lexer +lex.lex(object=calc) +lex.runmain(data="3+4") + + + + diff --git a/ext/ply/test/lex_opt_alias.py b/ext/ply/test/lex_opt_alias.py new file mode 100644 index 000000000..5d5ed4c4e --- /dev/null +++ b/ext/ply/test/lex_opt_alias.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# lex_opt_alias.py +# +# Tests ability to match up functions with states, aliases, and +# lexing tables. +# ----------------------------------------------------------------------------- + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +tokens = ( + 'NAME','NUMBER', + ) + +states = (('instdef','inclusive'),('spam','exclusive')) + +literals = ['=','+','-','*','/', '(',')'] + +# Tokens + +def t_instdef_spam_BITS(t): + r'[01-]+' + return t + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ANY_NUMBER = NUMBER + +t_ignore = " \t" +t_spam_ignore = t_ignore + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +t_spam_error = t_error + +# Build the lexer +import ply.lex as lex +lex.lex(optimize=1,lextab="aliastab") +lex.runmain(data="3+4") diff --git a/ext/ply/test/lex_optimize.py b/ext/ply/test/lex_optimize.py new file mode 100644 index 000000000..0e447e668 --- /dev/null +++ b/ext/ply/test/lex_optimize.py @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# lex_optimize.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lex.lex(optimize=1) +lex.runmain(data="3+4") + + + diff --git a/ext/ply/test/lex_optimize2.py b/ext/ply/test/lex_optimize2.py new file mode 100644 index 000000000..64555f635 --- /dev/null +++ b/ext/ply/test/lex_optimize2.py @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# lex_optimize2.py +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lex.lex(optimize=1,lextab="opt2tab") +lex.runmain(data="3+4") + + + diff --git a/ext/ply/test/lex_optimize3.py b/ext/ply/test/lex_optimize3.py new file mode 100644 index 000000000..c6c8cce65 --- /dev/null +++ b/ext/ply/test/lex_optimize3.py @@ -0,0 +1,52 @@ +# ----------------------------------------------------------------------------- +# lex_optimize3.py +# +# Writes table in a subdirectory structure. +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.lex as lex + +tokens = ( + 'NAME','NUMBER', + 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', + 'LPAREN','RPAREN', + ) + +# Tokens + +t_PLUS = r'\+' +t_MINUS = r'-' +t_TIMES = r'\*' +t_DIVIDE = r'/' +t_EQUALS = r'=' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + try: + t.value = int(t.value) + except ValueError: + print("Integer value too large %s" % t.value) + t.value = 0 + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +lex.lex(optimize=1,lextab="lexdir.sub.calctab",outputdir="lexdir/sub") +lex.runmain(data="3+4") + + + diff --git a/ext/ply/test/lex_re1.exp b/ext/ply/test/lex_re1.exp deleted file mode 100644 index 4d54f4b89..000000000 --- a/ext/ply/test/lex_re1.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: Invalid regular expression for rule 't_NUMBER'. unbalanced parenthesis -Traceback (most recent call last): - File "./lex_re1.py", line 25, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_re1.py b/ext/ply/test/lex_re1.py index 9e544fe0d..5be7aefca 100644 --- a/ext/ply/test/lex_re1.py +++ b/ext/ply/test/lex_re1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_re1.py # # Bad regular expression in a string import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -20,7 +20,7 @@ t_NUMBER = r'(\d+' def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_re2.exp b/ext/ply/test/lex_re2.exp deleted file mode 100644 index a4e2e8920..000000000 --- a/ext/ply/test/lex_re2.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: Regular expression for rule 't_PLUS' matches empty string. 
-Traceback (most recent call last): - File "./lex_re2.py", line 25, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_re2.py b/ext/ply/test/lex_re2.py index 522b41592..8dfb8e3fd 100644 --- a/ext/ply/test/lex_re2.py +++ b/ext/ply/test/lex_re2.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_re2.py # # Regular expression rule matches empty string import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -20,7 +20,7 @@ t_NUMBER = r'(\d+)' def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_re3.exp b/ext/ply/test/lex_re3.exp deleted file mode 100644 index b9ada216d..000000000 --- a/ext/ply/test/lex_re3.exp +++ /dev/null @@ -1,8 +0,0 @@ -lex: Invalid regular expression for rule 't_POUND'. unbalanced parenthesis -lex: Make sure '#' in rule 't_POUND' is escaped with '\#'. -Traceback (most recent call last): - File "./lex_re3.py", line 27, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_re3.py b/ext/ply/test/lex_re3.py index 099e1568c..e17992537 100644 --- a/ext/ply/test/lex_re3.py +++ b/ext/ply/test/lex_re3.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_re3.py # # Regular expression rule matches empty string import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -22,7 +22,7 @@ t_POUND = r'#' def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_rule1.exp b/ext/ply/test/lex_rule1.exp deleted file mode 100644 index 0c23ca294..000000000 --- a/ext/ply/test/lex_rule1.exp +++ /dev/null @@ -1,2 +0,0 @@ -lex: t_NUMBER not defined as a function or string -SyntaxError: lex: Unable to build lexer. 
diff --git a/ext/ply/test/lex_rule1.py b/ext/ply/test/lex_rule1.py index e49a15bba..0406c6f30 100644 --- a/ext/ply/test/lex_rule1.py +++ b/ext/ply/test/lex_rule1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_rule1.py # -# Rule defined as some other type +# Rule function with incorrect number of arguments import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -20,7 +20,7 @@ t_NUMBER = 1 def t_error(t): pass -sys.tracebacklimit = 0 + lex.lex() diff --git a/ext/ply/test/lex_rule2.py b/ext/ply/test/lex_rule2.py new file mode 100644 index 000000000..1c29d8737 --- /dev/null +++ b/ext/ply/test/lex_rule2.py @@ -0,0 +1,29 @@ +# lex_rule2.py +# +# Rule function with incorrect number of arguments + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +def t_NUMBER(): + r'\d+' + return t + +def t_error(t): + pass + + + +lex.lex() + + diff --git a/ext/ply/test/lex_rule3.py b/ext/ply/test/lex_rule3.py new file mode 100644 index 000000000..9ea94da2f --- /dev/null +++ b/ext/ply/test/lex_rule3.py @@ -0,0 +1,27 @@ +# lex_rule3.py +# +# Rule function with incorrect number of arguments + +import sys +if ".." not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + ] + +t_PLUS = r'\+' +t_MINUS = r'-' +def t_NUMBER(t,s): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/ext/ply/test/lex_state1.exp b/ext/ply/test/lex_state1.exp deleted file mode 100644 index facad03cc..000000000 --- a/ext/ply/test/lex_state1.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: states must be defined as a tuple or list. -Traceback (most recent call last): - File "./lex_state1.py", line 38, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. 
diff --git a/ext/ply/test/lex_state1.py b/ext/ply/test/lex_state1.py index 887bc2345..7528c9154 100644 --- a/ext/ply/test/lex_state1.py +++ b/ext/ply/test/lex_state1.py @@ -3,11 +3,11 @@ # Bad state declaration import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", @@ -23,17 +23,17 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_state2.exp b/ext/ply/test/lex_state2.exp deleted file mode 100644 index 8b042515a..000000000 --- a/ext/ply/test/lex_state2.exp +++ /dev/null @@ -1,8 +0,0 @@ -lex: invalid state specifier 'comment'. Must be a tuple (statename,'exclusive|inclusive') -lex: invalid state specifier 'example'. Must be a tuple (statename,'exclusive|inclusive') -Traceback (most recent call last): - File "./lex_state2.py", line 38, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_state2.py b/ext/ply/test/lex_state2.py index 3053c7110..3aef69ea2 100644 --- a/ext/ply/test/lex_state2.py +++ b/ext/ply/test/lex_state2.py @@ -3,11 +3,11 @@ # Bad state declaration import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", @@ -23,17 +23,17 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_state3.exp b/ext/ply/test/lex_state3.exp deleted file mode 100644 index 53ab57ff1..000000000 --- a/ext/ply/test/lex_state3.exp +++ /dev/null @@ -1,8 +0,0 @@ -lex: state name 1 must be a string -lex: No rules defined for state 'example' -Traceback (most recent call last): - File "./lex_state3.py", line 40, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_state3.py b/ext/ply/test/lex_state3.py index bb22d241e..616e48474 100644 --- a/ext/ply/test/lex_state3.py +++ b/ext/ply/test/lex_state3.py @@ -1,13 +1,13 @@ -# lex_state2.py +# lex_state3.py # # Bad state declaration import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", @@ -25,17 +25,17 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_state4.exp b/ext/ply/test/lex_state4.exp deleted file mode 100644 index 412ae8f8a..000000000 --- a/ext/ply/test/lex_state4.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: state type for state comment must be 'inclusive' or 'exclusive' -Traceback (most recent call last): - File "./lex_state4.py", line 39, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_state4.py b/ext/ply/test/lex_state4.py index 3815135b4..182501614 100644 --- a/ext/ply/test/lex_state4.py +++ b/ext/ply/test/lex_state4.py @@ -1,19 +1,19 @@ -# lex_state2.py +# lex_state4.py # # Bad state declaration import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", ] -comment = 1 + states = (('comment', 'exclsive'),) t_PLUS = r'\+' @@ -24,17 +24,17 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys + lex.lex() diff --git a/ext/ply/test/lex_state5.exp b/ext/ply/test/lex_state5.exp deleted file mode 100644 index 8eeae5641..000000000 --- a/ext/ply/test/lex_state5.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: state 'comment' already defined. 
-Traceback (most recent call last): - File "./lex_state5.py", line 40, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_state5.py b/ext/ply/test/lex_state5.py index 58718538c..4ce828e4f 100644 --- a/ext/ply/test/lex_state5.py +++ b/ext/ply/test/lex_state5.py @@ -1,19 +1,18 @@ -# lex_state2.py +# lex_state5.py # # Bad state declaration import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", ] -comment = 1 states = (('comment', 'exclusive'), ('comment', 'exclusive')) @@ -25,17 +24,16 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys lex.lex() diff --git a/ext/ply/test/lex_state_noerror.exp b/ext/ply/test/lex_state_noerror.exp deleted file mode 100644 index e14149f18..000000000 --- a/ext/ply/test/lex_state_noerror.exp +++ /dev/null @@ -1 +0,0 @@ -lex: Warning. no error rule is defined for exclusive state 'comment' diff --git a/ext/ply/test/lex_state_noerror.py b/ext/ply/test/lex_state_noerror.py index 3fda7da49..90bbea878 100644 --- a/ext/ply/test/lex_state_noerror.py +++ b/ext/ply/test/lex_state_noerror.py @@ -1,19 +1,18 @@ -# lex_state2.py +# lex_state_noerror.py # # Declaration of a state for which no rules are defined import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", ] -comment = 1 states = (('comment', 'exclusive'),) t_PLUS = r'\+' @@ -24,17 +23,16 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys lex.lex() diff --git a/ext/ply/test/lex_state_norule.exp b/ext/ply/test/lex_state_norule.exp deleted file mode 100644 index 7097d2a3a..000000000 --- a/ext/ply/test/lex_state_norule.exp +++ /dev/null @@ -1,7 +0,0 @@ -lex: No rules defined for state 'example' -Traceback (most recent call last): - File "./lex_state_norule.py", line 40, in <module> - lex.lex() - File "../ply/lex.py", line 759, in lex - raise SyntaxError,"lex: Unable to build lexer." -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_state_norule.py b/ext/ply/test/lex_state_norule.py index 2f6cabc51..64ec6d3ec 100644 --- a/ext/ply/test/lex_state_norule.py +++ b/ext/ply/test/lex_state_norule.py @@ -1,19 +1,18 @@ -# lex_state2.py +# lex_state_norule.py # # Declaration of a state for which no rules are defined import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", ] -comment = 1 states = (('comment', 'exclusive'), ('example', 'exclusive')) @@ -25,17 +24,16 @@ t_NUMBER = r'\d+' def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): pass -import sys lex.lex() diff --git a/ext/ply/test/lex_state_try.exp b/ext/ply/test/lex_state_try.exp deleted file mode 100644 index 11768b893..000000000 --- a/ext/ply/test/lex_state_try.exp +++ /dev/null @@ -1,7 +0,0 @@ -(NUMBER,'3',1,0) -(PLUS,'+',1,2) -(NUMBER,'4',1,4) -Entering comment state -comment body LexToken(body_part,'This is a comment */',1,9) -(PLUS,'+',1,30) -(NUMBER,'10',1,32) diff --git a/ext/ply/test/lex_state_try.py b/ext/ply/test/lex_state_try.py index a2206cbc3..fd5ba2221 100644 --- a/ext/ply/test/lex_state_try.py +++ b/ext/ply/test/lex_state_try.py @@ -1,19 +1,18 @@ -# lex_state2.py +# lex_state_try.py # # Declaration of a state for which no rules are defined import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex -tokens = [ +tokens = [ "PLUS", "MINUS", "NUMBER", ] -comment = 1 states = (('comment', 'exclusive'),) t_PLUS = r'\+' @@ -26,11 +25,11 @@ t_ignore = " \t" def t_comment(t): r'/\*' t.lexer.begin('comment') - print "Entering comment state" + print("Entering comment state") def t_comment_body_part(t): r'(.|\n)*\*/' - print "comment body", t + print("comment body %s" % t) t.lexer.begin('INITIAL') def t_error(t): @@ -39,8 +38,6 @@ def t_error(t): t_comment_error = t_error t_comment_ignore = t_ignore -import sys - lex.lex() data = "3 + 4 /* This is a comment */ + 10" diff --git a/ext/ply/test/lex_token1.exp b/ext/ply/test/lex_token1.exp deleted file mode 100644 index 3792831fa..000000000 --- a/ext/ply/test/lex_token1.exp +++ /dev/null @@ -1 +0,0 @@ -SyntaxError: lex: module does not define 'tokens' diff --git a/ext/ply/test/lex_token1.py b/ext/ply/test/lex_token1.py index 380c31ce1..6fca300b1 100644 --- a/ext/ply/test/lex_token1.py +++ b/ext/ply/test/lex_token1.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_token1.py # # Tests for absence of tokens variable import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -14,8 +14,6 @@ t_NUMBER = r'\d+' def t_error(t): pass -sys.tracebacklimit = 0 - lex.lex() diff --git a/ext/ply/test/lex_token2.exp b/ext/ply/test/lex_token2.exp deleted file mode 100644 index 3f98fe51d..000000000 --- a/ext/ply/test/lex_token2.exp +++ /dev/null @@ -1 +0,0 @@ -SyntaxError: lex: tokens must be a list or tuple. diff --git a/ext/ply/test/lex_token2.py b/ext/ply/test/lex_token2.py index 87db8a0ab..6e65ab0f9 100644 --- a/ext/ply/test/lex_token2.py +++ b/ext/ply/test/lex_token2.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_token2.py # # Tests for tokens of wrong type import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -16,7 +16,6 @@ t_NUMBER = r'\d+' def t_error(t): pass -sys.tracebacklimit = 0 lex.lex() diff --git a/ext/ply/test/lex_token3.exp b/ext/ply/test/lex_token3.exp deleted file mode 100644 index d991d3c37..000000000 --- a/ext/ply/test/lex_token3.exp +++ /dev/null @@ -1,2 +0,0 @@ -lex: Rule 't_MINUS' defined for an unspecified token MINUS. -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_token3.py b/ext/ply/test/lex_token3.py index 27ce9476d..636452ea4 100644 --- a/ext/ply/test/lex_token3.py +++ b/ext/ply/test/lex_token3.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_token3.py # # tokens is right type, but is missing a token for one rule import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -19,9 +19,6 @@ t_NUMBER = r'\d+' def t_error(t): pass - -sys.tracebacklimit = 0 - lex.lex() diff --git a/ext/ply/test/lex_token4.exp b/ext/ply/test/lex_token4.exp deleted file mode 100644 index 3dd88e05a..000000000 --- a/ext/ply/test/lex_token4.exp +++ /dev/null @@ -1,2 +0,0 @@ -lex: Bad token name '-' -SyntaxError: lex: Unable to build lexer. diff --git a/ext/ply/test/lex_token4.py b/ext/ply/test/lex_token4.py index 612ff13c2..52947e9cc 100644 --- a/ext/ply/test/lex_token4.py +++ b/ext/ply/test/lex_token4.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_token4.py # # Bad token name import sys -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -21,8 +21,6 @@ t_NUMBER = r'\d+' def t_error(t): pass -sys.tracebacklimit = 0 - lex.lex() diff --git a/ext/ply/test/lex_token5.exp b/ext/ply/test/lex_token5.exp deleted file mode 100644 index 2f038890a..000000000 --- a/ext/ply/test/lex_token5.exp +++ /dev/null @@ -1 +0,0 @@ -ply.lex.LexError: ./lex_token5.py:19: Rule 't_NUMBER' returned an unknown token type 'NUM' diff --git a/ext/ply/test/lex_token5.py b/ext/ply/test/lex_token5.py index 77fabdee9..ef7a3c502 100644 --- a/ext/ply/test/lex_token5.py +++ b/ext/ply/test/lex_token5.py @@ -1,9 +1,9 @@ -# lex_token.py +# lex_token5.py # # Return a bad token name import sys -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex @@ -24,8 +24,6 @@ def t_NUMBER(t): def t_error(t): pass -sys.tracebacklimit = 0 - lex.lex() lex.input("1234") t = lex.token() diff --git a/ext/ply/test/lex_token_dup.py b/ext/ply/test/lex_token_dup.py new file mode 100644 index 000000000..384f4e9db --- /dev/null +++ b/ext/ply/test/lex_token_dup.py @@ -0,0 +1,29 @@ +# lex_token_dup.py +# +# Duplicate token name in tokens + +import sys +if ".." 
not in sys.path: sys.path.insert(0,"..") + +import ply.lex as lex + +tokens = [ + "PLUS", + "MINUS", + "NUMBER", + "MINUS" + ] + +t_PLUS = r'\+' +t_MINUS = r'-' + +def t_NUMBER(t): + r'\d+' + return t + +def t_error(t): + pass + +lex.lex() + + diff --git a/ext/ply/test/rununit.py b/ext/ply/test/rununit.py deleted file mode 100644 index cb7a2298b..000000000 --- a/ext/ply/test/rununit.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -'''Script to run all tests using python "unittest" module''' - -__author__ = "Miki Tebeka <miki.tebeka@zoran.com>" - -from unittest import TestCase, main, makeSuite, TestSuite -from os import popen, environ, remove -from glob import glob -from sys import executable, argv -from os.path import isfile, basename, splitext - -# Add path to lex.py and yacc.py -environ["PYTHONPATH"] = ".." - -class PLYTest(TestCase): - '''General test case for PLY test''' - def _runtest(self, filename): - '''Run a single test file an compare result''' - exp_file = filename.replace(".py", ".exp") - self.failUnless(isfile(exp_file), "can't find %s" % exp_file) - pipe = popen("%s %s 2>&1" % (executable, filename)) - out = pipe.read().strip() - self.failUnlessEqual(out, open(exp_file).read().strip()) - - -class LexText(PLYTest): - '''Testing Lex''' - pass - -class YaccTest(PLYTest): - '''Testing Yacc''' - - def tearDown(self): - '''Cleanup parsetab.py[c] file''' - for ext in (".py", ".pyc"): - fname = "parsetab%s" % ext - if isfile(fname): - remove(fname) - -def add_test(klass, filename): - '''Add a test to TestCase class''' - def t(self): - self._runtest(filename) - # Test name is test_FILENAME without the ./ and without the .py - setattr(klass, "test_%s" % (splitext(basename(filename))[0]), t) - -# Add lex tests -for file in glob("./lex_*.py"): - add_test(LexText, file) -lex_suite = makeSuite(LexText, "test_") - -# Add yacc tests -for file in glob("./yacc_*.py"): - add_test(YaccTest, file) -yacc_suite = makeSuite(YaccTest, "test_") - -# All tests suite 
-test_suite = TestSuite((lex_suite, yacc_suite)) - -if __name__ == "__main__": - main() - diff --git a/ext/ply/test/testlex.py b/ext/ply/test/testlex.py index df000b83d..606387d1d 100755 --- a/ext/ply/test/testlex.py +++ b/ext/ply/test/testlex.py @@ -1,57 +1,581 @@ -#!/usr/local/bin -# ---------------------------------------------------------------------- # testlex.py -# -# Run tests for the lexing module -# ---------------------------------------------------------------------- -import sys,os,glob +import unittest +try: + import StringIO +except ImportError: + import io as StringIO -if len(sys.argv) < 2: - print "Usage: python testlex.py directory" - raise SystemExit +import sys +sys.path.insert(0,"..") +sys.tracebacklimit = 0 -dirname = None -make = 0 +import ply.lex -for o in sys.argv[1:]: - if o == '-make': - make = 1 - else: - dirname = o - break +def check_expected(result,expected): + if sys.version_info[0] >= 3: + if isinstance(result,str): + result = result.encode('ascii') + if isinstance(expected,str): + expected = expected.encode('ascii') + resultlines = result.splitlines() + expectedlines = expected.splitlines() -if not dirname: - print "Usage: python testlex.py [-make] directory" - raise SystemExit -f = glob.glob("%s/%s" % (dirname,"lex_*.py")) + if len(resultlines) != len(expectedlines): + return False -print "**** Running tests for lex ****" + for rline,eline in zip(resultlines,expectedlines): + if not rline.endswith(eline): + return False + return True -for t in f: - name = t[:-3] - print "Testing %-32s" % name, - if make: - if not os.path.exists("%s.exp" % name): - os.system("python %s.py >%s.exp 2>&1" % (name,name)) - passed = 1 - else: - os.system("python %s.py >%s.out 2>&1" % (name,name)) - a = os.system("diff %s.out %s.exp >%s.dif" % (name,name,name)) - if a == 0: - passed = 1 - else: - passed = 0 +def run_import(module): + code = "import "+module + exec(code) + del sys.modules[module] + +# Tests related to errors and warnings when building 
lexers +class LexErrorWarningTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + def test_lex_doc1(self): + self.assertRaises(SyntaxError,run_import,"lex_doc1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_doc1.py:18: No regular expression defined for rule 't_NUMBER'\n")) + def test_lex_dup1(self): + self.assertRaises(SyntaxError,run_import,"lex_dup1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup1.py:20: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + + def test_lex_dup2(self): + self.assertRaises(SyntaxError,run_import,"lex_dup2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup2.py:22: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + + def test_lex_dup3(self): + self.assertRaises(SyntaxError,run_import,"lex_dup3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_dup3.py:20: Rule t_NUMBER redefined. Previously defined on line 18\n" )) - if passed: - print "Passed" - else: - print "Failed. 
See %s.dif" % name + def test_lex_empty(self): + self.assertRaises(SyntaxError,run_import,"lex_empty") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No rules of the form t_rulename are defined\n" + "No rules defined for state 'INITIAL'\n")) + def test_lex_error1(self): + run_import("lex_error1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No t_error rule is defined\n")) + def test_lex_error2(self): + self.assertRaises(SyntaxError,run_import,"lex_error2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Rule 't_error' must be defined as a function\n") + ) + def test_lex_error3(self): + self.assertRaises(SyntaxError,run_import,"lex_error3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_error3.py:20: Rule 't_error' requires an argument\n")) + def test_lex_error4(self): + self.assertRaises(SyntaxError,run_import,"lex_error4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_error4.py:20: Rule 't_error' has too many arguments\n")) + def test_lex_ignore(self): + self.assertRaises(SyntaxError,run_import,"lex_ignore") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_ignore.py:20: Rule 't_ignore' must be defined as a string\n")) + def test_lex_ignore2(self): + run_import("lex_ignore2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "t_ignore contains a literal backslash '\\'\n")) + + def test_lex_re1(self): + self.assertRaises(SyntaxError,run_import,"lex_re1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid regular expression for rule 't_NUMBER'. 
unbalanced parenthesis\n")) + + def test_lex_re2(self): + self.assertRaises(SyntaxError,run_import,"lex_re2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Regular expression for rule 't_PLUS' matches empty string\n")) + + def test_lex_re3(self): + self.assertRaises(SyntaxError,run_import,"lex_re3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid regular expression for rule 't_POUND'. unbalanced parenthesis\n" + "Make sure '#' in rule 't_POUND' is escaped with '\\#'\n")) + + def test_lex_rule1(self): + self.assertRaises(SyntaxError,run_import,"lex_rule1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "t_NUMBER not defined as a function or string\n")) + + def test_lex_rule2(self): + self.assertRaises(SyntaxError,run_import,"lex_rule2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_rule2.py:18: Rule 't_NUMBER' requires an argument\n")) + + def test_lex_rule3(self): + self.assertRaises(SyntaxError,run_import,"lex_rule3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "lex_rule3.py:18: Rule 't_NUMBER' has too many arguments\n")) + + + def test_lex_state1(self): + self.assertRaises(SyntaxError,run_import,"lex_state1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "states must be defined as a tuple or list\n")) + + def test_lex_state2(self): + self.assertRaises(SyntaxError,run_import,"lex_state2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid state specifier 'comment'. Must be a tuple (statename,'exclusive|inclusive')\n" + "Invalid state specifier 'example'. 
Must be a tuple (statename,'exclusive|inclusive')\n")) + + def test_lex_state3(self): + self.assertRaises(SyntaxError,run_import,"lex_state3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State name 1 must be a string\n" + "No rules defined for state 'example'\n")) + + def test_lex_state4(self): + self.assertRaises(SyntaxError,run_import,"lex_state4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State type for state comment must be 'inclusive' or 'exclusive'\n")) + + + def test_lex_state5(self): + self.assertRaises(SyntaxError,run_import,"lex_state5") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "State 'comment' already defined\n")) + + def test_lex_state_noerror(self): + run_import("lex_state_noerror") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No error rule is defined for exclusive state 'comment'\n")) + + def test_lex_state_norule(self): + self.assertRaises(SyntaxError,run_import,"lex_state_norule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No rules defined for state 'example'\n")) + + def test_lex_token1(self): + self.assertRaises(SyntaxError,run_import,"lex_token1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No token list is defined\n" + "Rule 't_NUMBER' defined for an unspecified token NUMBER\n" + "Rule 't_PLUS' defined for an unspecified token PLUS\n" + "Rule 't_MINUS' defined for an unspecified token MINUS\n" +)) + + def test_lex_token2(self): + self.assertRaises(SyntaxError,run_import,"lex_token2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "tokens must be a list or tuple\n" + "Rule 't_NUMBER' defined for an unspecified token NUMBER\n" + "Rule 't_PLUS' defined for an unspecified token PLUS\n" + "Rule 't_MINUS' defined for an unspecified token MINUS\n" +)) + + def test_lex_token3(self): + self.assertRaises(SyntaxError,run_import,"lex_token3") + result 
= sys.stderr.getvalue() + self.assert_(check_expected(result, + "Rule 't_MINUS' defined for an unspecified token MINUS\n")) + + + def test_lex_token4(self): + self.assertRaises(SyntaxError,run_import,"lex_token4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Bad token name '-'\n")) + + + def test_lex_token5(self): + try: + run_import("lex_token5") + except ply.lex.LexError: + e = sys.exc_info()[1] + self.assert_(check_expected(str(e),"lex_token5.py:19: Rule 't_NUMBER' returned an unknown token type 'NUM'")) + + def test_lex_token_dup(self): + run_import("lex_token_dup") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Token 'MINUS' multiply defined\n")) + + + def test_lex_literal1(self): + self.assertRaises(SyntaxError,run_import,"lex_literal1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid literal '**'. Must be a single character\n")) + + def test_lex_literal2(self): + self.assertRaises(SyntaxError,run_import,"lex_literal2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Invalid literals specification. 
literals must be a sequence of characters\n")) + +import os +import subprocess +import shutil + +# Tests related to various build options associated with lexers +class LexBuildOptionTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + try: + shutil.rmtree("lexdir") + except OSError: + pass + + def test_lex_module(self): + run_import("lex_module") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + def test_lex_object(self): + run_import("lex_object") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + + def test_lex_closure(self): + run_import("lex_closure") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + def test_lex_optimize(self): + try: + os.remove("lextab.py") + except OSError: + pass + try: + os.remove("lextab.pyc") + except OSError: + pass + try: + os.remove("lextab.pyo") + except OSError: + pass + run_import("lex_optimize") + + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lextab.py")) + + + p = subprocess.Popen([sys.executable,'-O','lex_optimize.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lextab.pyo")) + + os.remove("lextab.pyo") + p = subprocess.Popen([sys.executable,'-OO','lex_optimize.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + 
self.assert_(os.path.exists("lextab.pyo")) + try: + os.remove("lextab.py") + except OSError: + pass + try: + os.remove("lextab.pyc") + except OSError: + pass + try: + os.remove("lextab.pyo") + except OSError: + pass + + def test_lex_optimize2(self): + try: + os.remove("opt2tab.py") + except OSError: + pass + try: + os.remove("opt2tab.pyc") + except OSError: + pass + try: + os.remove("opt2tab.pyo") + except OSError: + pass + run_import("lex_optimize2") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("opt2tab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_optimize2.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("opt2tab.pyo")) + os.remove("opt2tab.pyo") + p = subprocess.Popen([sys.executable,'-OO','lex_optimize2.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("opt2tab.pyo")) + try: + os.remove("opt2tab.py") + except OSError: + pass + try: + os.remove("opt2tab.pyc") + except OSError: + pass + try: + os.remove("opt2tab.pyo") + except OSError: + pass + + def test_lex_optimize3(self): + try: + shutil.rmtree("lexdir") + except OSError: + pass + + os.mkdir("lexdir") + os.mkdir("lexdir/sub") + open("lexdir/__init__.py","w").write("") + open("lexdir/sub/__init__.py","w").write("") + run_import("lex_optimize3") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lexdir/sub/calctab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_optimize3.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + 
"(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lexdir/sub/calctab.pyo")) + os.remove("lexdir/sub/calctab.pyo") + p = subprocess.Popen([sys.executable,'-OO','lex_optimize3.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(PLUS,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("lexdir/sub/calctab.pyo")) + try: + shutil.rmtree("lexdir") + except OSError: + pass + + def test_lex_opt_alias(self): + try: + os.remove("aliastab.py") + except OSError: + pass + try: + os.remove("aliastab.pyc") + except OSError: + pass + try: + os.remove("aliastab.pyo") + except OSError: + pass + run_import("lex_opt_alias") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("aliastab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_opt_alias.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("aliastab.pyo")) + os.remove("aliastab.pyo") + p = subprocess.Popen([sys.executable,'-OO','lex_opt_alias.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(NUMBER,3,1,0)\n" + "(+,'+',1,1)\n" + "(NUMBER,4,1,2)\n")) + self.assert_(os.path.exists("aliastab.pyo")) + try: + os.remove("aliastab.py") + except OSError: + pass + try: + os.remove("aliastab.pyc") + except OSError: + pass + try: + os.remove("aliastab.pyo") + except OSError: + pass + + def test_lex_many_tokens(self): + try: + os.remove("manytab.py") + except OSError: + pass + try: + os.remove("manytab.pyc") + except OSError: + pass + try: + os.remove("manytab.pyo") + except OSError: + pass + run_import("lex_many_tokens") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(TOK34,'TOK34:',1,0)\n" 
+ "(TOK143,'TOK143:',1,7)\n" + "(TOK269,'TOK269:',1,15)\n" + "(TOK372,'TOK372:',1,23)\n" + "(TOK452,'TOK452:',1,31)\n" + "(TOK561,'TOK561:',1,39)\n" + "(TOK999,'TOK999:',1,47)\n" + )) + + self.assert_(os.path.exists("manytab.py")) + + p = subprocess.Popen([sys.executable,'-O','lex_many_tokens.py'], + stdout=subprocess.PIPE) + result = p.stdout.read() + self.assert_(check_expected(result, + "(TOK34,'TOK34:',1,0)\n" + "(TOK143,'TOK143:',1,7)\n" + "(TOK269,'TOK269:',1,15)\n" + "(TOK372,'TOK372:',1,23)\n" + "(TOK452,'TOK452:',1,31)\n" + "(TOK561,'TOK561:',1,39)\n" + "(TOK999,'TOK999:',1,47)\n" + )) + + self.assert_(os.path.exists("manytab.pyo")) + os.remove("manytab.pyo") + try: + os.remove("manytab.py") + except OSError: + pass + try: + os.remove("manytab.pyc") + except OSError: + pass + try: + os.remove("manytab.pyo") + except OSError: + pass + +# Tests related to run-time behavior of lexers +class LexRunTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + + def test_lex_hedit(self): + run_import("lex_hedit") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(H_EDIT_DESCRIPTOR,'abc',1,0)\n" + "(H_EDIT_DESCRIPTOR,'abcdefghij',1,6)\n" + "(H_EDIT_DESCRIPTOR,'xy',1,20)\n")) + + def test_lex_state_try(self): + run_import("lex_state_try") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "(NUMBER,'3',1,0)\n" + "(PLUS,'+',1,2)\n" + "(NUMBER,'4',1,4)\n" + "Entering comment state\n" + "comment body LexToken(body_part,'This is a comment */',1,9)\n" + "(PLUS,'+',1,30)\n" + "(NUMBER,'10',1,32)\n" + )) + + + +unittest.main() diff --git a/ext/ply/test/testyacc.py b/ext/ply/test/testyacc.py index a185cbb29..cc53b6d8f 100644 --- a/ext/ply/test/testyacc.py +++ b/ext/ply/test/testyacc.py @@ -1,58 +1,324 @@ -#!/usr/local/bin -# 
---------------------------------------------------------------------- # testyacc.py -# -# Run tests for the yacc module -# ---------------------------------------------------------------------- -import sys,os,glob +import unittest +try: + import StringIO +except ImportError: + import io as StringIO -if len(sys.argv) < 2: - print "Usage: python testyacc.py directory" - raise SystemExit +import sys +import os -dirname = None -make = 0 +sys.path.insert(0,"..") +sys.tracebacklimit = 0 -for o in sys.argv[1:]: - if o == '-make': - make = 1 - else: - dirname = o - break +import ply.yacc -if not dirname: - print "Usage: python testyacc.py [-make] directory" - raise SystemExit +def check_expected(result,expected): + resultlines = [] + for line in result.splitlines(): + if line.startswith("WARNING: "): + line = line[9:] + elif line.startswith("ERROR: "): + line = line[7:] + resultlines.append(line) -f = glob.glob("%s/%s" % (dirname,"yacc_*.py")) + expectedlines = expected.splitlines() + if len(resultlines) != len(expectedlines): + return False + for rline,eline in zip(resultlines,expectedlines): + if not rline.endswith(eline): + return False + return True -print "**** Running tests for yacc ****" +def run_import(module): + code = "import "+module + exec(code) + del sys.modules[module] + +# Tests related to errors and warnings when building parsers +class YaccErrorWarningTests(unittest.TestCase): + def setUp(self): + sys.stderr = StringIO.StringIO() + sys.stdout = StringIO.StringIO() + try: + os.remove("parsetab.py") + os.remove("parsetab.pyc") + except OSError: + pass + + def tearDown(self): + sys.stderr = sys.__stderr__ + sys.stdout = sys.__stdout__ + def test_yacc_badargs(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badargs") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments\n" + "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument\n" + )) + def 
test_yacc_badid(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badid") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_badid.py:32: Illegal name 'bad&rule' in rule 'statement'\n" + "yacc_badid.py:36: Illegal rule name 'bad&rule'\n" + )) -for t in f: - name = t[:-3] - print "Testing %-32s" % name, - os.system("rm -f %s/parsetab.*" % dirname) - if make: - if not os.path.exists("%s.exp" % name): - os.system("python %s.py >%s.exp 2>&1" % (name,name)) - passed = 1 - else: - os.system("python %s.py >%s.out 2>&1" % (name,name)) - a = os.system("diff %s.out %s.exp >%s.dif" % (name,name,name)) - if a == 0: - passed = 1 - else: - passed = 0 + def test_yacc_badprec(self): + try: + run_import("yacc_badprec") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "precedence must be a list or tuple\n" + )) + def test_yacc_badprec2(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badprec2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Bad precedence table\n" + )) - if passed: - print "Passed" - else: - print "Failed. See %s.dif" % name + def test_yacc_badprec3(self): + run_import("yacc_badprec3") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Precedence already specified for terminal 'MINUS'\n" + "Generating LALR tables\n" + )) + + def test_yacc_badrule(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badrule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_badrule.py:24: Syntax error. Expected ':'\n" + "yacc_badrule.py:28: Syntax error in rule 'statement'\n" + "yacc_badrule.py:33: Syntax error. Expected ':'\n" + "yacc_badrule.py:42: Syntax error. 
Expected ':'\n" + )) + def test_yacc_badtok(self): + try: + run_import("yacc_badtok") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "tokens must be a list or tuple\n")) + def test_yacc_dup(self): + run_import("yacc_dup") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_dup.py:27: Function p_statement redefined. Previously defined on line 23\n" + "Token 'EQUALS' defined, but not used\n" + "There is 1 unused token\n" + "Generating LALR tables\n" + )) + def test_yacc_error1(self): + try: + run_import("yacc_error1") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error1.py:61: p_error() requires 1 argument\n")) + def test_yacc_error2(self): + try: + run_import("yacc_error2") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error2.py:61: p_error() requires 1 argument\n")) + def test_yacc_error3(self): + try: + run_import("yacc_error3") + except ply.yacc.YaccError: + e = sys.exc_info()[1] + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "'p_error' defined, but is not a function or method\n")) + + def test_yacc_error4(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_error4") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_error4.py:62: Illegal rule name 'error'. 
Already defined as a token\n" + )) + + def test_yacc_inf(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_inf") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Token 'NUMBER' defined, but not used\n" + "There is 1 unused token\n" + "Infinite recursion detected for symbol 'statement'\n" + "Infinite recursion detected for symbol 'expression'\n" + )) + def test_yacc_literal(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_literal") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_literal.py:36: Literal token '**' in rule 'expression' may only be a single character\n" + )) + def test_yacc_misplaced(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_misplaced") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_misplaced.py:32: Misplaced '|'\n" + )) + def test_yacc_missing1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_missing1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule\n" + )) + + def test_yacc_nested(self): + run_import("yacc_nested") + result = sys.stdout.getvalue() + self.assert_(check_expected(result, + "A\n" + "A\n" + "A\n", + )) + + def test_yacc_nodoc(self): + run_import("yacc_nodoc") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr' (ignored)\n" + "Generating LALR tables\n" + )) + + def test_yacc_noerror(self): + run_import("yacc_noerror") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "no p_error() function is defined\n" + "Generating LALR tables\n" + )) + + def test_yacc_nop(self): + run_import("yacc_nop") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_nop.py:27: Possible grammar rule 'statement_expr' defined without p_ prefix\n" + "Generating 
LALR tables\n" + )) + + def test_yacc_notfunc(self): + run_import("yacc_notfunc") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "'p_statement_assign' not defined as a function\n" + "Token 'EQUALS' defined, but not used\n" + "There is 1 unused token\n" + "Generating LALR tables\n" + )) + def test_yacc_notok(self): + try: + run_import("yacc_notok") + except ply.yacc.YaccError: + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "No token list is defined\n")) + + def test_yacc_rr(self): + run_import("yacc_rr") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + "1 reduce/reduce conflict\n" + "reduce/reduce conflict in state 15 resolved using rule (statement -> NAME EQUALS NUMBER)\n" + "rejected rule (expression -> NUMBER) in state 15\n" + + )) + + def test_yacc_rr_unused(self): + run_import("yacc_rr_unused") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "no p_error() function is defined\n" + "Generating LALR tables\n" + "3 reduce/reduce conflicts\n" + "reduce/reduce conflict in state 1 resolved using rule (rule3 -> A)\n" + "rejected rule (rule4 -> A) in state 1\n" + "reduce/reduce conflict in state 1 resolved using rule (rule3 -> A)\n" + "rejected rule (rule5 -> A) in state 1\n" + "reduce/reduce conflict in state 1 resolved using rule (rule4 -> A)\n" + "rejected rule (rule5 -> A) in state 1\n" + "Rule (rule5 -> A) is never reduced\n" + )) + + def test_yacc_simple(self): + run_import("yacc_simple") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + )) + def test_yacc_sr(self): + run_import("yacc_sr") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Generating LALR tables\n" + "20 shift/reduce conflicts\n" + )) + + def test_yacc_term1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_term1") + result = sys.stderr.getvalue() + 
self.assert_(check_expected(result, + "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token\n" + )) + + def test_yacc_unused(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_unused") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule\n" + "Symbol 'COMMA' is unreachable\n" + "Symbol 'exprlist' is unreachable\n" + )) + def test_yacc_unused_rule(self): + run_import("yacc_unused_rule") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_unused_rule.py:62: Rule 'integer' defined, but not used\n" + "There is 1 unused rule\n" + "Symbol 'integer' is unreachable\n" + "Generating LALR tables\n" + )) + + def test_yacc_uprec(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec") + result = sys.stderr.getvalue() + print repr(result) + self.assert_(check_expected(result, + "yacc_uprec.py:37: Nothing known about the precedence of 'UMINUS'\n" + )) + + def test_yacc_uprec2(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec2") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "yacc_uprec2.py:37: Syntax error. Nothing follows %prec\n" + )) + + def test_yacc_prec1(self): + self.assertRaises(ply.yacc.YaccError,run_import,"yacc_prec1") + result = sys.stderr.getvalue() + self.assert_(check_expected(result, + "Precedence rule 'left' defined for unknown symbol '+'\n" + "Precedence rule 'left' defined for unknown symbol '*'\n" + "Precedence rule 'left' defined for unknown symbol '-'\n" + "Precedence rule 'left' defined for unknown symbol '/'\n" + )) + + + +unittest.main() diff --git a/ext/ply/test/yacc_badargs.exp b/ext/ply/test/yacc_badargs.exp deleted file mode 100644 index e99467659..000000000 --- a/ext/ply/test/yacc_badargs.exp +++ /dev/null @@ -1,3 +0,0 @@ -./yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments. 
-./yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument. -ply.yacc.YaccError: Unable to construct parser. diff --git a/ext/ply/test/yacc_badargs.py b/ext/ply/test/yacc_badargs.py index 810e5298a..9a1d03f2c 100644 --- a/ext/ply/test/yacc_badargs.py +++ b/ext/ply/test/yacc_badargs.py @@ -26,7 +26,7 @@ def p_statement_assign(t,s): def p_statement_expr(): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -36,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -55,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_badid.py b/ext/ply/test/yacc_badid.py new file mode 100644 index 000000000..e4b9f5eeb --- /dev/null +++ b/ext/ply/test/yacc_badid.py @@ -0,0 +1,77 @@ +# ----------------------------------------------------------------------------- +# yacc_badid.py +# +# Attempt to define a rule with a bad-identifier name +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_statement_expr2(t): + 'statement : bad&rule' + pass + +def p_badrule(t): + 'bad&rule : expression' + pass + + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + pass + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_badprec.exp b/ext/ply/test/yacc_badprec.exp deleted file mode 100644 index f4f574b99..000000000 --- a/ext/ply/test/yacc_badprec.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: precedence must be a list or tuple. diff --git a/ext/ply/test/yacc_badprec.py b/ext/ply/test/yacc_badprec.py index 8f64652e6..3013bb621 100644 --- a/ext/ply/test/yacc_badprec.py +++ b/ext/ply/test/yacc_badprec.py @@ -4,9 +4,8 @@ # Bad precedence specifier # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -23,7 +22,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -33,7 +32,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -52,11 +51,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_badprec2.exp b/ext/ply/test/yacc_badprec2.exp deleted file mode 100644 index 8fac075ce..000000000 --- a/ext/ply/test/yacc_badprec2.exp +++ /dev/null @@ -1,3 +0,0 @@ -yacc: Invalid precedence table. -yacc: Generating LALR parsing table... -yacc: 8 shift/reduce conflicts diff --git a/ext/ply/test/yacc_badprec2.py b/ext/ply/test/yacc_badprec2.py index 206bda768..83093b42d 100644 --- a/ext/ply/test/yacc_badprec2.py +++ b/ext/ply/test/yacc_badprec2.py @@ -4,9 +4,8 @@ # Bad precedence # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_badprec3.py b/ext/ply/test/yacc_badprec3.py new file mode 100644 index 000000000..d925ecd55 --- /dev/null +++ b/ext/ply/test/yacc_badprec3.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_badprec3.py +# +# Bad precedence +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE','MINUS'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[3] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_badrule.exp b/ext/ply/test/yacc_badrule.exp deleted file mode 100644 index a87bf7d68..000000000 --- a/ext/ply/test/yacc_badrule.exp +++ /dev/null @@ -1,5 +0,0 @@ -./yacc_badrule.py:25: Syntax error. Expected ':' -./yacc_badrule.py:29: Syntax error in rule 'statement' -./yacc_badrule.py:34: Syntax error. Expected ':' -./yacc_badrule.py:43: Syntax error. Expected ':' -ply.yacc.YaccError: Unable to construct parser. 
diff --git a/ext/ply/test/yacc_badrule.py b/ext/ply/test/yacc_badrule.py index f5fef8ad6..92af6460a 100644 --- a/ext/ply/test/yacc_badrule.py +++ b/ext/ply/test/yacc_badrule.py @@ -4,9 +4,8 @@ # Syntax problems in the rule strings # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression: MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_badtok.exp b/ext/ply/test/yacc_badtok.exp deleted file mode 100644 index ccdc0e7a1..000000000 --- a/ext/ply/test/yacc_badtok.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: tokens must be a list or tuple. diff --git a/ext/ply/test/yacc_badtok.py b/ext/ply/test/yacc_badtok.py index 4f2af5162..fc4afe19e 100644 --- a/ext/ply/test/yacc_badtok.py +++ b/ext/ply/test/yacc_badtok.py @@ -5,9 +5,7 @@ # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 - -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc tokens = "Hello" @@ -28,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -38,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -57,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_dup.exp b/ext/ply/test/yacc_dup.exp deleted file mode 100644 index fdfb2103d..000000000 --- a/ext/ply/test/yacc_dup.exp +++ /dev/null @@ -1,4 +0,0 @@ -./yacc_dup.py:28: Function p_statement redefined. Previously defined on line 24 -yacc: Warning. Token 'EQUALS' defined, but not used. -yacc: Warning. There is 1 unused token. -yacc: Generating LALR parsing table... diff --git a/ext/ply/test/yacc_dup.py b/ext/ply/test/yacc_dup.py index e0b683d8f..309ba3299 100644 --- a/ext/ply/test/yacc_dup.py +++ b/ext/ply/test/yacc_dup.py @@ -4,9 +4,8 @@ # Duplicated rule name # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement(t): def p_statement(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_error1.exp b/ext/ply/test/yacc_error1.exp deleted file mode 100644 index 13bed0461..000000000 --- a/ext/ply/test/yacc_error1.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: ./yacc_error1.py:62: p_error() requires 1 argument. diff --git a/ext/ply/test/yacc_error1.py b/ext/ply/test/yacc_error1.py index 2768fc14a..10ac6a9cd 100644 --- a/ext/ply/test/yacc_error1.py +++ b/ext/ply/test/yacc_error1.py @@ -4,9 +4,8 @@ # Bad p_error() function # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t,s): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_error2.exp b/ext/ply/test/yacc_error2.exp deleted file mode 100644 index 4a7628d78..000000000 --- a/ext/ply/test/yacc_error2.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: ./yacc_error2.py:62: p_error() requires 1 argument. diff --git a/ext/ply/test/yacc_error2.py b/ext/ply/test/yacc_error2.py index 8f3a05290..759141809 100644 --- a/ext/ply/test/yacc_error2.py +++ b/ext/ply/test/yacc_error2.py @@ -1,12 +1,11 @@ # ----------------------------------------------------------------------------- -# yacc_error1.py +# yacc_error2.py # # Bad p_error() function # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_error3.exp b/ext/ply/test/yacc_error3.exp deleted file mode 100644 index 7fca2fe95..000000000 --- a/ext/ply/test/yacc_error3.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: 'p_error' defined, but is not a function or method. diff --git a/ext/ply/test/yacc_error3.py b/ext/ply/test/yacc_error3.py index b387de5d1..4604a48bf 100644 --- a/ext/ply/test/yacc_error3.py +++ b/ext/ply/test/yacc_error3.py @@ -1,12 +1,11 @@ # ----------------------------------------------------------------------------- -# yacc_error1.py +# yacc_error3.py # # Bad p_error() function # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,7 +55,7 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 p_error = "blah" diff --git a/ext/ply/test/yacc_error4.py b/ext/ply/test/yacc_error4.py new file mode 100644 index 000000000..9c550cd83 --- /dev/null +++ b/ext/ply/test/yacc_error4.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# yacc_error4.py +# +# Attempt to define a rule named 'error' +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error_handler(t): + 'error : NAME' + pass + +def p_error(t): + pass + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_inf.exp b/ext/ply/test/yacc_inf.exp deleted file mode 100644 index 88cfa4a2e..000000000 --- a/ext/ply/test/yacc_inf.exp +++ /dev/null @@ -1,5 +0,0 @@ -yacc: Warning. Token 'NUMBER' defined, but not used. -yacc: Warning. There is 1 unused token. -yacc: Infinite recursion detected for symbol 'statement'. -yacc: Infinite recursion detected for symbol 'expression'. -ply.yacc.YaccError: Unable to construct parser. 
diff --git a/ext/ply/test/yacc_inf.py b/ext/ply/test/yacc_inf.py index 9b9aef75d..efd3612a1 100644 --- a/ext/ply/test/yacc_inf.py +++ b/ext/ply/test/yacc_inf.py @@ -4,9 +4,8 @@ # Infinite recursion # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -48,7 +47,7 @@ def p_expression_group(t): t[0] = t[2] def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_literal.py b/ext/ply/test/yacc_literal.py new file mode 100644 index 000000000..0d628035b --- /dev/null +++ b/ext/ply/test/yacc_literal.py @@ -0,0 +1,69 @@ +# ----------------------------------------------------------------------------- +# yacc_literal.py +# +# Grammar with bad literal characters +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression + | expression '**' expression ''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_misplaced.py b/ext/ply/test/yacc_misplaced.py new file mode 100644 index 000000000..9159b0109 --- /dev/null +++ b/ext/ply/test/yacc_misplaced.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_misplaced.py +# +# A misplaced | in grammar rules +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + ''' | expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_missing1.exp b/ext/ply/test/yacc_missing1.exp deleted file mode 100644 index de63d4f48..000000000 --- a/ext/ply/test/yacc_missing1.exp +++ /dev/null @@ -1,2 +0,0 @@ -./yacc_missing1.py:25: Symbol 'location' used, but not defined as a token or a rule. -ply.yacc.YaccError: Unable to construct parser. diff --git a/ext/ply/test/yacc_missing1.py b/ext/ply/test/yacc_missing1.py index fbc54d8c5..d1b510592 100644 --- a/ext/ply/test/yacc_missing1.py +++ b/ext/ply/test/yacc_missing1.py @@ -4,9 +4,8 @@ # Grammar with a missing rule # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_nested.py b/ext/ply/test/yacc_nested.py new file mode 100644 index 000000000..a1b061e78 --- /dev/null +++ b/ext/ply/test/yacc_nested.py @@ -0,0 +1,33 @@ +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") + +from ply import lex, yacc + +t_A = 'A' +t_B = 'B' +t_C = 'C' + +tokens = ('A', 'B', 'C') + +the_lexer = lex.lex() + +def t_error(t): + pass + +def p_error(p): + pass + +def p_start(t): + '''start : A nest C''' + pass + +def p_nest(t): + '''nest : B''' + print(t[-1]) + +the_parser = yacc.yacc(debug = False, write_tables = False) + +the_parser.parse('ABC', the_lexer) +the_parser.parse('ABC', the_lexer, tracking=True) +the_parser.parse('ABC', the_lexer, tracking=True, debug=1) diff --git a/ext/ply/test/yacc_nodoc.exp b/ext/ply/test/yacc_nodoc.exp deleted file mode 100644 index 889ccfce7..000000000 --- a/ext/ply/test/yacc_nodoc.exp +++ /dev/null @@ -1,2 +0,0 @@ -./yacc_nodoc.py:28: No documentation string specified in function 'p_statement_expr' -yacc: Generating LALR parsing table... 
diff --git a/ext/ply/test/yacc_nodoc.py b/ext/ply/test/yacc_nodoc.py index 4c5ab20a9..0f61920ab 100644 --- a/ext/ply/test/yacc_nodoc.py +++ b/ext/ply/test/yacc_nodoc.py @@ -4,9 +4,8 @@ # Rule with a missing doc-string # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -26,7 +25,7 @@ def p_statement_assign(t): names[t[1]] = t[3] def p_statement_expr(t): - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -36,7 +35,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -55,11 +54,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_noerror.exp b/ext/ply/test/yacc_noerror.exp deleted file mode 100644 index 3ae771225..000000000 --- a/ext/ply/test/yacc_noerror.exp +++ /dev/null @@ -1,2 +0,0 @@ -yacc: Warning. no p_error() function is defined. -yacc: Generating LALR parsing table... diff --git a/ext/ply/test/yacc_noerror.py b/ext/ply/test/yacc_noerror.py index 9c11838eb..b38c7581f 100644 --- a/ext/ply/test/yacc_noerror.py +++ b/ext/ply/test/yacc_noerror.py @@ -4,9 +4,8 @@ # No p_error() rule defined. # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,7 +55,7 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 diff --git a/ext/ply/test/yacc_nop.exp b/ext/ply/test/yacc_nop.exp deleted file mode 100644 index 515fff7dc..000000000 --- a/ext/ply/test/yacc_nop.exp +++ /dev/null @@ -1,2 +0,0 @@ -./yacc_nop.py:28: Warning. Possible grammar rule 'statement_expr' defined without p_ prefix. -yacc: Generating LALR parsing table... diff --git a/ext/ply/test/yacc_nop.py b/ext/ply/test/yacc_nop.py index c0b431d4b..789a9cfad 100644 --- a/ext/ply/test/yacc_nop.py +++ b/ext/ply/test/yacc_nop.py @@ -4,9 +4,8 @@ # Possible grammar rule defined without p_ prefix # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_notfunc.exp b/ext/ply/test/yacc_notfunc.exp deleted file mode 100644 index f73bc93a5..000000000 --- a/ext/ply/test/yacc_notfunc.exp +++ /dev/null @@ -1,4 +0,0 @@ -yacc: Warning. 'p_statement_assign' not defined as a function -yacc: Warning. Token 'EQUALS' defined, but not used. -yacc: Warning. There is 1 unused token. -yacc: Generating LALR parsing table... diff --git a/ext/ply/test/yacc_notfunc.py b/ext/ply/test/yacc_notfunc.py index 838935509..5093a7448 100644 --- a/ext/ply/test/yacc_notfunc.py +++ b/ext/ply/test/yacc_notfunc.py @@ -4,9 +4,8 @@ # p_rule not defined as a function # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -25,7 +24,7 @@ p_statement_assign = "Blah" def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -35,7 +34,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -54,11 +53,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_notok.exp b/ext/ply/test/yacc_notok.exp deleted file mode 100644 index d2399fe17..000000000 --- a/ext/ply/test/yacc_notok.exp +++ /dev/null @@ -1 +0,0 @@ -ply.yacc.YaccError: module does not define a list 'tokens' diff --git a/ext/ply/test/yacc_notok.py b/ext/ply/test/yacc_notok.py index e566a1bf4..cff55a8d0 100644 --- a/ext/ply/test/yacc_notok.py +++ b/ext/ply/test/yacc_notok.py @@ -5,9 +5,8 @@ # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc # Parsing rules @@ -26,7 +25,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -36,7 +35,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -55,11 +54,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_prec1.py b/ext/ply/test/yacc_prec1.py new file mode 100644 index 000000000..2ca6afc0b --- /dev/null +++ b/ext/ply/test/yacc_prec1.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# yacc_prec1.py +# +# Tests case where precedence specifier doesn't match up to terminals +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_rr.exp b/ext/ply/test/yacc_rr.exp deleted file mode 100644 index f73cefdec..000000000 --- a/ext/ply/test/yacc_rr.exp +++ /dev/null @@ -1,2 +0,0 @@ -yacc: Generating LALR parsing table... -yacc: 1 reduce/reduce conflict diff --git a/ext/ply/test/yacc_rr.py b/ext/ply/test/yacc_rr.py index bb8cba235..e7336c2f0 100644 --- a/ext/ply/test/yacc_rr.py +++ b/ext/ply/test/yacc_rr.py @@ -4,9 +4,8 @@ # A grammar with a reduce/reduce conflict # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -31,7 +30,7 @@ def p_statement_assign_2(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -41,7 +40,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -60,11 +59,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_rr_unused.py b/ext/ply/test/yacc_rr_unused.py new file mode 100644 index 000000000..1ca5f7e5b --- /dev/null +++ b/ext/ply/test/yacc_rr_unused.py @@ -0,0 +1,30 @@ +# ----------------------------------------------------------------------------- +# yacc_rr_unused.py +# +# A grammar with reduce/reduce conflicts and a rule that never +# gets reduced. +# ----------------------------------------------------------------------------- +import sys + +if ".." not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +tokens = ('A', 'B', 'C') + +def p_grammar(p): + ''' + rule1 : rule2 B + | rule2 C + + rule2 : rule3 B + | rule4 + | rule5 + + rule3 : A + + rule4 : A + + rule5 : A + ''' + +yacc.yacc() diff --git a/ext/ply/test/yacc_simple.exp b/ext/ply/test/yacc_simple.exp deleted file mode 100644 index 38360315f..000000000 --- a/ext/ply/test/yacc_simple.exp +++ /dev/null @@ -1 +0,0 @@ -yacc: Generating LALR parsing table... 
diff --git a/ext/ply/test/yacc_simple.py b/ext/ply/test/yacc_simple.py index b5dc9f39c..bd989f4d6 100644 --- a/ext/ply/test/yacc_simple.py +++ b/ext/ply/test/yacc_simple.py @@ -4,9 +4,8 @@ # A simple, properly specifier grammar # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_sr.exp b/ext/ply/test/yacc_sr.exp deleted file mode 100644 index 1b764502c..000000000 --- a/ext/ply/test/yacc_sr.exp +++ /dev/null @@ -1,2 +0,0 @@ -yacc: Generating LALR parsing table... -yacc: 20 shift/reduce conflicts diff --git a/ext/ply/test/yacc_sr.py b/ext/ply/test/yacc_sr.py index e2f03ec74..69a1e9c7f 100644 --- a/ext/ply/test/yacc_sr.py +++ b/ext/ply/test/yacc_sr.py @@ -4,9 +4,8 @@ # A grammar with shift-reduce conflicts # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -22,7 +21,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -32,7 +31,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression' @@ -51,11 +50,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_term1.exp b/ext/ply/test/yacc_term1.exp deleted file mode 100644 index 40f9bdf64..000000000 --- a/ext/ply/test/yacc_term1.exp +++ /dev/null @@ -1,2 +0,0 @@ -./yacc_term1.py:25: Illegal rule name 'NUMBER'. Already defined as a token. -ply.yacc.YaccError: Unable to construct parser. diff --git a/ext/ply/test/yacc_term1.py b/ext/ply/test/yacc_term1.py index bbc52da86..eaa36e9d6 100644 --- a/ext/ply/test/yacc_term1.py +++ b/ext/ply/test/yacc_term1.py @@ -4,9 +4,8 @@ # Terminal used on the left-hand-side # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,11 +55,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_unused.exp b/ext/ply/test/yacc_unused.exp deleted file mode 100644 index 6caafd266..000000000 --- a/ext/ply/test/yacc_unused.exp +++ /dev/null @@ -1,4 +0,0 @@ -./yacc_unused.py:63: Symbol 'COMMA' used, but not defined as a token or a rule. -yacc: Symbol 'COMMA' is unreachable. -yacc: Symbol 'exprlist' is unreachable. -ply.yacc.YaccError: Unable to construct parser. diff --git a/ext/ply/test/yacc_unused.py b/ext/ply/test/yacc_unused.py index 3a61f99cd..55b677b1f 100644 --- a/ext/ply/test/yacc_unused.py +++ b/ext/ply/test/yacc_unused.py @@ -4,9 +4,8 @@ # A grammar with an unused rule # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -27,7 +26,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -37,7 +36,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -56,7 +55,7 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_expr_list(t): @@ -69,7 +68,7 @@ def p_expr_list_2(t): def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_unused_rule.py b/ext/ply/test/yacc_unused_rule.py new file mode 100644 index 000000000..4868ef863 --- /dev/null +++ b/ext/ply/test/yacc_unused_rule.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# yacc_unused_rule.py +# +# Grammar with an unused rule +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules +precedence = ( + ('left','PLUS','MINUS'), + ('left','TIMES','DIVIDE'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec UMINUS' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_integer(t): + 'integer : NUMBER' + t[0] = t[1] + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + diff --git a/ext/ply/test/yacc_uprec.exp b/ext/ply/test/yacc_uprec.exp deleted file mode 100644 index eb9a39886..000000000 --- a/ext/ply/test/yacc_uprec.exp +++ /dev/null @@ -1,2 +0,0 @@ -./yacc_uprec.py:38: Nothing known about the precedence of 'UMINUS' -ply.yacc.YaccError: Unable to construct parser. diff --git a/ext/ply/test/yacc_uprec.py b/ext/ply/test/yacc_uprec.py index 0e8711e88..569adb8f9 100644 --- a/ext/ply/test/yacc_uprec.py +++ b/ext/ply/test/yacc_uprec.py @@ -4,9 +4,8 @@ # A grammar with a bad %prec specifier # ----------------------------------------------------------------------------- import sys -sys.tracebacklimit = 0 -sys.path.insert(0,"..") +if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens @@ -22,7 +21,7 @@ def p_statement_assign(t): def p_statement_expr(t): 'statement : expression' - print t[1] + print(t[1]) def p_expression_binop(t): '''expression : expression PLUS expression @@ -32,7 +31,7 @@ def p_expression_binop(t): if t[2] == '+' : t[0] = t[1] + t[3] elif t[2] == '-': t[0] = t[1] - t[3] elif t[2] == '*': t[0] = t[1] * t[3] - elif t[3] == '/': t[0] = t[1] / t[3] + elif t[2] == '/': t[0] = t[1] / t[3] def p_expression_uminus(t): 'expression : MINUS expression %prec UMINUS' @@ -51,11 +50,11 @@ def p_expression_name(t): try: t[0] = names[t[1]] except LookupError: - print "Undefined name '%s'" % t[1] + print("Undefined name '%s'" % t[1]) t[0] = 0 def p_error(t): - print "Syntax error at '%s'" % t.value + print("Syntax error at '%s'" % t.value) yacc.yacc() diff --git a/ext/ply/test/yacc_uprec2.py b/ext/ply/test/yacc_uprec2.py new file mode 100644 index 000000000..73274bfb6 --- /dev/null +++ b/ext/ply/test/yacc_uprec2.py @@ -0,0 +1,63 @@ +# ----------------------------------------------------------------------------- +# yacc_uprec2.py +# +# A grammar with a bad %prec specifier +# ----------------------------------------------------------------------------- +import sys + +if ".." 
not in sys.path: sys.path.insert(0,"..") +import ply.yacc as yacc + +from calclex import tokens + +# Parsing rules + +# dictionary of names +names = { } + +def p_statement_assign(t): + 'statement : NAME EQUALS expression' + names[t[1]] = t[3] + +def p_statement_expr(t): + 'statement : expression' + print(t[1]) + +def p_expression_binop(t): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + if t[2] == '+' : t[0] = t[1] + t[3] + elif t[2] == '-': t[0] = t[1] - t[3] + elif t[2] == '*': t[0] = t[1] * t[3] + elif t[2] == '/': t[0] = t[1] / t[3] + +def p_expression_uminus(t): + 'expression : MINUS expression %prec' + t[0] = -t[2] + +def p_expression_group(t): + 'expression : LPAREN expression RPAREN' + t[0] = t[2] + +def p_expression_number(t): + 'expression : NUMBER' + t[0] = t[1] + +def p_expression_name(t): + 'expression : NAME' + try: + t[0] = names[t[1]] + except LookupError: + print("Undefined name '%s'" % t[1]) + t[0] = 0 + +def p_error(t): + print("Syntax error at '%s'" % t.value) + +yacc.yacc() + + + + |