diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..13660b9 --- /dev/null +++ b/.clang-format @@ -0,0 +1,12 @@ +Language: Cpp +Standard: Cpp11 +BasedOnStyle: LLVM +BreakBeforeBraces: Linux +SpaceBeforeParens: Never +TabWidth: 2 +UseTab: Always +AlignTrailingComments: true +AllowShortIfStatementsOnASingleLine: false +IndentCaseLabels: true + +Cpp11BracedListStyle: true diff --git a/.gitignore b/.gitignore index 1c2cc89..0a2bc93 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ *.code-workspace bin build - +etc diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..c90f9b5 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,27 @@ +INCLUDE (CheckIncludeFiles) + +# define project +cmake_minimum_required (VERSION 3.5) +project (jstrings VERSION 1.1 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_COMPILER_NAMES clang++ g++ icpc c++ cxx) +# set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG") + +if (NOT EXISTS ${CMAKE_BINARY_DIR}/CMakeCache.txt) + if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) + endif() +endif() + +# define target +aux_source_directory(${PROJECT_SOURCE_DIR}/src CPPFILES) +add_executable(jstrings ${CPPFILES}) + +target_include_directories(jstrings PUBLIC "${PROJECT_SOURCE_DIR}/inc") +target_compile_features(jstrings PUBLIC cxx_std_11) +target_link_libraries(jstrings png) + +install(TARGETS jstrings + RUNTIME DESTINATION bin) diff --git a/LICENSE b/LICENSE index 94a9ed0..a86c458 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. 
- - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. 
For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. 
The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. 
- - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. 
The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. 
+MIT License + +Copyright (c) 2018-2019 Damian Rogers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index a8c5819..0000000 --- a/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -CC := g++ -CFLAGS := -g -Wall - -SRCDIR := src -BUILDDIR := build -BINDIR := bin - -TARGET := $(BINDIR)/jstrings -INC := -I include - -SRCEXT := cpp -SRC := $(shell find $(SRCDIR) -type f -name *.$(SRCEXT)) -OBJ := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SRC:.$(SRCEXT)=.o)) - -$(TARGET): $(OBJ) - @mkdir -p $(BINDIR) - $(CC) $^ -o $(TARGET) - -$(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT) - @mkdir -p $(BUILDDIR) - $(CC) $(CFLAGS) $(INC) -c -o $@ $< - -clean: - $(RM) -r $(BUILDDIR)/* $(TARGET) - -init: - mkdir $(BUILDDIR) - mkdir $(BINDIR) - diff --git a/README.md b/README.md index 0bcc489..d2a30de 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,47 @@ # jstrings -A tool for finding JIS-based Japanese characters in binary data. 
+A tool for finding JIS-based Japanese text in binary data. ## Usage - jstrings [options] [input] + jstrings [options] [input_file] -Input can be a filename or data from stdin. +Input can be a filename or data from stdin. Output is sent to stdout. ### Options - -m number + -e encoding + --encoding encoding -Set minimum number of characters to match as a valid string. Default: 10. +Specify the encoding to use. Use one of the strings listed in parentheses below for that encoding: - -e encoding +* Shift-JIS (shift-jis, shiftjis, sjis) +* EUC-JP (euc, euc-jp, eucjp) +* Microsoft CP932 (cp932, windows932, windows31j) -Specify the encoding to use. Currently, the only valid value is "shift-jis". Default: shift-jis +Optional; default is Shift-JIS. - -l + -m number + --match-length number -Use little-endian order for multibyte characters +Set minimum number of characters to match as a valid string. Optional; default is 10. - -jisx0213 + -c number + --cutoff number -Use JIS X 0213 character set instead of JIS X 0208 for double byte characters +Limit the output to the specified number of characters for a string. This is useful for "previewing" a file which may have large blocks of junk data that happen to fall within the range of valid encoding values. Optional; default is no cutoff. -## Notes +## Output Data is output in its original encoding without any conversion. Other tools, such as iconv, can do conversion to something more useful (such as UTF8). For example: - jstrings file.bin | iconv -f SHIFT-JIS -t UTF-8 -c + # for Shift-JIS + jstrings file.bin | iconv -f SHIFT-JIS -t UTF-8 -c | less + # for CP932 + jstrings file.bin | iconv -f CP932 -t UTF-8 -c | less + # for EUC-JP + jstrings file.bin | iconv -f EUC-JP -t UTF-8 -c | less -### To Do -- Add support for other JIS encodings: CP932, EUC -- Add support for JIS X 0212 for non-SJIS encodings (only EUC?)
-- Add option to only return strings with double-byte characters present +## Building +CMake is used for the build system. From the root directory: + mkdir build && cd build + cmake .. + make + sudo make install diff --git a/inc/enc_cp932.h b/inc/enc_cp932.h new file mode 100644 index 0000000..c6ae1ec --- /dev/null +++ b/inc/enc_cp932.h @@ -0,0 +1,24 @@ +/*! + * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ + +#ifndef ENC_CP932_H +#define ENC_CP932_H + +#include "enc_shiftjis.h" + +namespace encodings +{ + +class cp932 : public shift_jis +{ +public: + u8 is_valid(u8 const *data); +}; + +} // namespace encodings + +#endif \ No newline at end of file diff --git a/inc/enc_eucjp.h b/inc/enc_eucjp.h new file mode 100644 index 0000000..c03a05c --- /dev/null +++ b/inc/enc_eucjp.h @@ -0,0 +1,22 @@ +/*! + * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ +#ifndef ENC_EUCJP_H +#define ENC_EUCJP_H +#include "encoding.h" + +namespace encodings +{ + +class euc : public encoding +{ +public: + euc() : encoding(3){}; + u8 is_valid(u8 const *data); +}; + +} // namespace encodings +#endif // ENC_EUC_H diff --git a/inc/enc_shiftjis.h b/inc/enc_shiftjis.h new file mode 100644 index 0000000..d38d172 --- /dev/null +++ b/inc/enc_shiftjis.h @@ -0,0 +1,23 @@ +/*! + * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ + +#ifndef ENC_SHIFTJIS_H +#define ENC_SHIFTJIS_H +#include "encoding.h" + +namespace encodings +{ + +class shift_jis : public encoding +{ +public: + shift_jis() : encoding(2){}; + u8 is_valid(u8 const *data); +}; + +} // namespace encodings +#endif // ENC_SHIFTJIS_H diff --git a/inc/encoding.h b/inc/encoding.h new file mode 100644 index 0000000..1d15f44 --- /dev/null +++ b/inc/encoding.h @@ -0,0 +1,32 @@ +/*! 
+ * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ + +#ifndef ENCODING_H +#define ENCODING_H +#include "types.h" + +/*! + * \brief Abstract for encoding classes + */ +class encoding +{ +public: + encoding(u8 max_seq_len) { this->max_seq_len = max_seq_len; } + + /*! + * \brief Determines if the given bytes are a valid byte sequence for the + * encoding. Returns the number of valid bytes if true. + */ + virtual u8 is_valid(u8 const *data) = 0; + + const u8 get_max_seq_len() { return this->max_seq_len; } + +protected: + u8 max_seq_len; +}; + +#endif // ENCODING_H \ No newline at end of file diff --git a/inc/main.h b/inc/main.h new file mode 100644 index 0000000..aca4044 --- /dev/null +++ b/inc/main.h @@ -0,0 +1,31 @@ +/*! + * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "enc_cp932.h" +#include "enc_eucjp.h" +#include "enc_shiftjis.h" +#include "encoding.h" +#include "types.h" + +/*! + * \enum encodings + * \brief List of supported encodings + */ +enum enctypes { shift_jis, cp932, eucjp }; + +void process_args(int argc, char **argv); + +void print_help(); diff --git a/inc/types.h b/inc/types.h new file mode 100644 index 0000000..ca27fd1 --- /dev/null +++ b/inc/types.h @@ -0,0 +1,48 @@ +/*! + * \author Damian Rogers (damian@sudden-desu.net) + * \version 1.1 + * \date 2019.12.01 + * \copyright MIT License + */ + +#ifndef TYPES_H +#define TYPES_H + +#include +#include +#include +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +template using uptr = std::unique_ptr; +template using sptr = std::shared_ptr; + +typedef std::map kvmap; + +/*! 
+ * \brief POD structure for containing a found string + */ +class found_string +{ +public: + /*! + * \brief The offset of the beginning of the found string relative to the + * start of the stream + */ + off_t address; + /*! + * \brief The extracted string data + */ + std::vector data; +}; + +#endif \ No newline at end of file diff --git a/include/jis_enc.h b/include/jis_enc.h deleted file mode 100644 index 2424e3f..0000000 --- a/include/jis_enc.h +++ /dev/null @@ -1,78 +0,0 @@ -/*! - * \author Damian Rogers (damian@sudden-desu.net) - * \version 1.0 - * \date 2017.12.30 - * \copyright GNU Public License -*/ -#ifndef JIS_ENC_H -#define JIS_ENC_H -#include -#include - -/*! - * \brief POD structure for containing a found string -*/ -struct found_string { - /*! - * \brief The offset of the beginning of the found string relative to the start of the stream - */ - off_t address; - /*! - * \brief The extracted string data - */ - std::vector data; -}; - -/*! - * \brief Abstract class for JIS based encoding classes -*/ -class jis_enc { -protected: - std::istream* instream; - /*! - * \brief Minimum number of characters to match to count as a found string - */ - size_t min_len = 10; - /*! - * \brief Determines byte order for multibyte characters - */ - bool is_big_endian = true; - /*! - * \brief Use the JIS X 0213 character set instead of JIS X 0208 - */ - bool use_jisx0213 = false; - -public: - jis_enc(std::istream* instream); - virtual ~jis_enc(); - /*! - * \brief Performs the search of the data stream - */ - virtual std::vector* find() = 0; - /*! - * \brief Setter for min_len - */ - void set_min_len(size_t min_len); - /*! - * \brief Getter for min_len - */ - size_t get_min_len(); - /*! - * \brief Setter for is_big_endian - */ - void set_is_big_endian(bool is_big_endian); - /*! - * \brief Getter for is_big_endian - */ - bool get_is_big_endian(); - /*! - * \brief Setter for use_jisx0213 - */ - void set_use_jisx0213(bool use_jisx0213); - /*! 
- * \brief Getter for use_jisx0213 - */ - bool get_use_jisx0213(); -}; - -#endif // JIS_ENC_H \ No newline at end of file diff --git a/include/main.h b/include/main.h deleted file mode 100644 index 4d65e40..0000000 --- a/include/main.h +++ /dev/null @@ -1,29 +0,0 @@ -/*! - * \brief A tool for finding JIS-based character strings in a binary stream - * \author Damian Rogers (damian@sudden-desu.net) - * \version 1.0 - * \date 2017.12.30 - * \copyright GNU Public License - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "jis_enc.h" -#include "shift_jis.h" - -/*! - * \enum encodings - * \brief List of supported JIS encodings - */ - -enum encodings { shift_jis_enc, cp932_enc, euc_enc }; - -void process_args(int argc, char **argv); - -void print_help(); diff --git a/include/shift_jis.h b/include/shift_jis.h deleted file mode 100644 index b4e1c7e..0000000 --- a/include/shift_jis.h +++ /dev/null @@ -1,41 +0,0 @@ -/*! - * \brief Find Shift-JIS encoded strings in a byte stream - * \author Damian Rogers (damian@sudden-desu.net) - * \version 1.0 - * \date 2017.12.30 - * \copyright GNU Public License -*/ -#ifndef SHIFT_JIS_H -#define SHIFT_JIS_H -#include -#include -#include -#include "jis_enc.h" - -class shift_jis : public jis_enc { - static bool is_byte2_valid(const uint8_t* c); - -public: - shift_jis(std::istream* instream) : jis_enc(instream) {}; - ~shift_jis(); - std::vector* find(); - /*! - * \brief Determines if a given byte is valid for JIS X 0201 and is printable - * \return True if the byte is valid - * - * In this case, valid means not within the reserved range (0x80 to 0xa0, 0xe0 to 0xff) and not an ASCII control character (0x00 to 0x1f, 0x7f) - */ - static bool is_jisx0201_printable(const uint8_t* c); - /*! - * \brief Determines if the given bytes make up a valid JIS X 0208 character - * \return True if the bytes are valid - */ - static bool is_jisx0208(const uint8_t* c_h, const uint8_t* c_l); - /*! 
- * \brief Determines if the given bytes make up a valid JIS X 0213 character - * \return True if the bytes are valid - */ - static bool is_jisx0213(const uint8_t* c_h, const uint8_t* c_l); -}; - -#endif // SHIFT_JIS_H diff --git a/src/enc_cp932.cpp b/src/enc_cp932.cpp new file mode 100644 index 0000000..2188654 --- /dev/null +++ b/src/enc_cp932.cpp @@ -0,0 +1,45 @@ +#include "enc_cp932.h" + +namespace encodings +{ + +u8 cp932::is_valid(u8 const *data) +{ + u8 valid_count = shift_jis::is_valid(data); + if(valid_count > 0) + return valid_count; + else { + u8 c_hi{*data}; + u8 c_lo{*(data + 1)}; + + /* + ku 13 - lead byte 0x87 - NEC tokushu moji + 0x40 to 0x5d, 0x5f to 0x75, 0x7e, 0x80 to 0x8f, 0x93, 0x94, 0x98, 0x99 + ku 89-92 - lead byte 0xed, 0xee - NEC sentei IBM kakuchou moji + 0x40 to 0x7e, 0x80 to 0xfc + ku 115-119 - lead byte 0xfa to 0xfc - IBM kakuchou moji + 0x40 to 0x7e, 0x80 to 0xfc (except 0xfc: 0x40 to 0x4b) + */ + switch(c_hi) { + case 0x87: + if(((c_lo >= 0x40) & (c_lo <= 0x5d)) || + ((c_lo >= 0x5f) & (c_lo <= 0x75)) || (c_lo == 0x7e) || + ((c_lo >= 0x80) & (c_lo <= 0x8f)) || (c_lo == 0x93) || + (c_lo == 0x94) || (c_lo == 0x98) || (c_lo == 0x99)) + return 2; + break; + case 0xed: + case 0xee: + case 0xfa: + case 0xfb: + // 0x7f should already be excluded from the base shiftjis check + if((c_lo >= 0x40) & (c_lo <= 0xfc)) + return 2; + case 0xfc: + if((c_lo >= 0x40) & (c_lo <= 0x4b)) + return 2; + } + } + return 0; +} +} // namespace encodings \ No newline at end of file diff --git a/src/enc_eucjp.cpp b/src/enc_eucjp.cpp new file mode 100644 index 0000000..091bc20 --- /dev/null +++ b/src/enc_eucjp.cpp @@ -0,0 +1,157 @@ +#include "enc_eucjp.h" + +#include +#include + +namespace encodings +{ +u8 euc::is_valid(u8 const *data) +{ + + u8 c_hi{*data}; + + // ASCII except control characters + if((c_hi == 0x09) || (c_hi >= 0x20) & (c_hi <= 0x7e)) + return 1; + + u8 c_lo{*(data + 1)}; + + // JIS X 0201 + // the raw 0201 code prefixed with 0x8E + if(c_hi == 0x8e) 
{ + if((c_lo >= 0xa1) & (c_lo <= 0xdf)) + return 2; + } + /* + // JIS X 0208 + // standard 0208 code with top bit set + // Partial ku + // 0xa2 - 0xa1 to 0xae, 0xba to 0xc1, 0xca to 0xd0, 0xdc to 0xea, 0xf2 to + 0xf9, 0xfe + // 0xa3 - 0xb0 to 0xb9, 0xc1 to 0xda, 0xe1 to 0xfa + // 0xa4 - 0xa1 to 0xf3 + // 0xa5 - 0xa1 to 0xf6 + // 0xa6 - 0xa1 to 0xb8, 0xc1 to 0xd8 + // 0xa7 - 0xa1 to 0xc1, 0xd1 to 0xf1 + // 0xa8 - 0xa1 to 0xc0 + // 0xcf - 0xa1 to 0xd3 + // 0xf4 - 0xa1 to 0xa6 + */ + switch(c_hi) { + case 0xa2: + if((c_lo >= 0xa1) & (c_lo <= 0xae) || (c_lo >= 0xba) & (c_lo <= 0xc1) || + (c_lo >= 0xca) & (c_lo <= 0xd0) || (c_lo >= 0xdc) & (c_lo <= 0xea) || + (c_lo >= 0xf2) & (c_lo <= 0xf9) || (c_lo == 0xfe)) + return 2; + return false; + case 0xa3: + if((c_lo >= 0xb0) & (c_lo <= 0xb9) || (c_lo >= 0xc1) & (c_lo <= 0xda) || + (c_lo >= 0xe1) & (c_lo <= 0xfa)) + return 2; + return false; + case 0xa4: + if((c_lo >= 0xa1) & (c_lo <= 0xf3)) + return 2; + return false; + case 0xa5: + if((c_lo >= 0xa1) & (c_lo <= 0xf6)) + return 2; + return false; + case 0xa6: + if((c_lo >= 0xa1) & (c_lo <= 0xb8) || (c_lo >= 0xc1) & (c_lo <= 0xd8)) + return 2; + return false; + case 0xa7: + if((c_lo >= 0xa1) & (c_lo <= 0xc1) || (c_lo >= 0xd1) & (c_lo <= 0xf1)) + return 2; + return false; + case 0xa8: + if((c_lo >= 0xa1) & (c_lo <= 0xc0)) + return 2; + return false; + case 0xcf: + if((c_lo >= 0xa1) & (c_lo <= 0xd3)) + return 2; + return false; + case 0xf4: + if((c_lo >= 0xa1) & (c_lo <= 0xa6)) + return 2; + return false; + } + + // Full ku (lo bytes 0xa1 to 0xfe) + // 0xa1, 0xb0 to 0xce, 0xd0 to 0xf3 + if(((c_hi == 0xa1) || ((c_hi >= 0xb0) & (c_hi <= 0xce)) || + ((c_hi >= 0xd0) & (c_hi <= 0xf3))) && + ((c_lo >= 0xa1) & (c_lo <= 0xfe))) + return 2; + + // JIS X 0212 + // 0208 extension, so only a few ku are present + // 0208 code prefixed by 0x8f + if(c_hi == 0x8f) { + u8 c_md = c_lo; + c_lo = *(data + 2); + + /* + // Partial ku + // 0xa2 - 0xaf to 0xb9, 0xc2 to 0xc4, 0xeb to 0xf1 + // 0xa6 - 
0xe1 to 0xe5, 0xe7, 0xe9, 0xea, 0xec, 0xf1 to 0xfc + // 0xa7 - 0xc2 to 0xce, 0xf2 to 0xfe + // 0xa9 - 0xa1, 0xa2, 0xa4, 0xa6, 0xa8, 0xa9, 0xab to 0xad, 0xaf, 0xb0, 0xc1 + to 0xd0 + // 0xaa - 0xa1 to 0xb8, 0xba to 0xf7 + // 0xab - 0xa1 to 0xbb, 0xbd to 0xc3, 0xc5 to 0xf7 + // 0xed - 0xa1 to 0xe3 + */ + switch(c_md) { + case 0xa2: + if(((c_lo >= 0xaf) & (c_lo <= 0xb9)) || + ((c_lo >= 0xc2) & (c_lo <= 0xc4)) || + ((c_lo >= 0xeb) & (c_lo <= 0xf1))) + return 3; + return false; + case 0xa6: + if(((c_lo >= 0xe1) & (c_lo <= 0xe5)) || (c_lo == 0xe7) || + (c_lo == 0xe9) || (c_lo == 0xea) || (c_lo == 0xec) || + ((c_lo >= 0xf1) & (c_lo <= 0xfc))) + return 3; + return false; + case 0xa7: + if(((c_lo >= 0xc2) & (c_lo <= 0xce)) || + ((c_lo >= 0xf2) & (c_lo <= 0xfe))) + return 3; + return false; + case 0xa9: + if((c_lo == 0xa1) || (c_lo == 0xa2) || (c_lo == 0xa4) || + (c_lo == 0xa6) || (c_lo == 0xa8) || (c_lo == 0xa9) || + ((c_lo >= 0xab) & (c_lo <= 0xad)) || (c_lo == 0xaf) || + (c_lo == 0xb0) || ((c_lo >= 0xc1) & (c_lo <= 0xd0))) + return 3; + return false; + case 0xaa: + if(((c_lo >= 0xa1) & (c_lo <= 0xb8)) || + ((c_lo >= 0xba) & (c_lo <= 0xf7))) + return 3; + return false; + case 0xab: + if(((c_lo >= 0xa1) & (c_lo <= 0xbb)) || + ((c_lo >= 0xbd) & (c_lo <= 0xc3)) || + ((c_lo >= 0xc5) & (c_lo <= 0xf7))) + return 3; + return false; + case 0xed: + if((c_lo >= 0xa1) & (c_lo <= 0xe3)) + return 3; + return false; + } + + // Full ku (lo bytes 0xa1 to 0xfe) + // 0xb0 to 0xec + if(((c_md >= 0xb0) & (c_md <= 0xec)) && ((c_lo >= 0xa1) & (c_lo <= 0xfe))) + return 3; + } + return false; +} + +} // namespace encodings \ No newline at end of file diff --git a/src/enc_shiftjis.cpp b/src/enc_shiftjis.cpp new file mode 100644 index 0000000..a745c85 --- /dev/null +++ b/src/enc_shiftjis.cpp @@ -0,0 +1,87 @@ +#include "enc_shiftjis.h" + +#include +#include + +namespace encodings +{ +/* + This supports traditional Shift-JIS, which encompasses JIS X 0201 and JIS X + 0208 There is extended support for 
0213, though we're not going to fiddle with + it Maybe we'll make an extended class +*/ +u8 shift_jis::is_valid(u8 const *data) +{ + // JIS X 0201 - 8-bit characters (including 7-bit ASCII) + // excludes non-printable (control code) and reserved bytes + // (but include tab (0x09)) + u8 c_hi{*data}; + if((c_hi == 0x09) || (c_hi >= 0x20) & (c_hi <= 0x7e) || + (c_hi >= 0xa1) & (c_hi <= 0xdf)) + return 1; + + // JIS X 0208 - 16 bit characters + u8 c_lo{*(data + 1)}; + + // sjis lower byte can never be these values + if((c_lo >= 0x0) & (c_lo <= 0x3f) || (c_lo == 0x7f) || + (c_lo >= 0xfd) & (c_lo <= 0xff)) + return false; + + // we've determined the second byte is valid as part of an SJIS encoded pair + // if we're in fast mode, that's good enough; return + // if(!accurate_mode) + // return true; + + // Partial fields (always excluding 0x7f) + // 0x81 - 0x40 to 0xac, 0xb8 to 0xbf, 0xc8 to 0xce, 0xda to 0xe8, 0xf0 to + // 0xf7, 0xfc 0x82 - 0x4f to 0x58, 0x60 to 0x79, 0x81 to 0x9a, 0x9f to 0xf1 + // 0x83 - 0x40 to 0x96, 0x9f to 0xb6, 0xbf to 0xd6 + // 0x84 - 0x40 to 0x60, 0x70 to 0x91, 0x9f to 0xbe + // 0x88 - 0x9f to 0xfc + // 0x98 - 0x40 to 0x72, 0x9f to 0xfc + // 0xea - 0x40 to 0xa4 + switch(c_hi) { + case 0x81: + if((c_lo >= 0x40) & (c_lo <= 0xac) || (c_lo >= 0xb8) & (c_lo <= 0xbf) || + (c_lo >= 0xc8) & (c_lo <= 0xce) || (c_lo >= 0xda) & (c_lo <= 0xe8) || + (c_lo >= 0xf0) & (c_lo <= 0xf7) || (c_lo == 0xfc)) + return 2; + return false; + case 0x82: + if((c_lo >= 0x4f) & (c_lo <= 0x58) || (c_lo >= 0x60) & (c_lo <= 0x79) || + (c_lo >= 0x81) & (c_lo <= 0x9a) || (c_lo >= 0x9f) & (c_lo <= 0xf1)) + return 2; + return false; + case 0x83: + if((c_lo >= 0x40) & (c_lo <= 0x96) || (c_lo >= 0x9f) & (c_lo <= 0xb6) || + (c_lo >= 0xbf) & (c_lo <= 0xd6)) + return 2; + return false; + case 0x84: + if((c_lo >= 0x40) & (c_lo <= 0x60) || (c_lo >= 0x70) & (c_lo <= 0x91) || + (c_lo >= 0x9f) & (c_lo <= 0xbe)) + return 2; + return false; + case 0x88: + if((c_lo >= 0x9f) & (c_lo <= 0xfc)) + 
return 2; + return false; + case 0x98: + if((c_lo >= 0x40) & (c_lo <= 0x72) || (c_lo >= 0x9f) & (c_lo <= 0xfc)) + return 2; + return false; + case 0xea: + if((c_lo >= 0x40) & (c_lo <= 0xa4)) + return 2; + return false; + } + // Full fields (0x40 to 0xfc, excluding 0x7f) + // 0x89 to 0x97, 0x99 to 0xe9 + if((((c_hi >= 0x89) & (c_hi <= 0x97)) || ((c_hi >= 0x99) & (c_hi <= 0xe9))) && + ((c_lo >= 0x40) & (c_lo <= 0xfc))) + return 2; + return false; +} + +} // namespace encodings diff --git a/src/jis_enc.cpp b/src/jis_enc.cpp deleted file mode 100644 index 2f649b5..0000000 --- a/src/jis_enc.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include "jis_enc.h" - -jis_enc::jis_enc(std::istream* instream) { - this->instream = instream; -} - -jis_enc::~jis_enc() { - -} - -void jis_enc::set_min_len(size_t min_len) { - this->min_len = min_len; -} - -size_t jis_enc::get_min_len() { - return this->min_len; -} - -void jis_enc::set_is_big_endian(bool is_big_endian) { - this->is_big_endian = is_big_endian; -} - -bool jis_enc::get_is_big_endian() { - return this->is_big_endian; -} - -void jis_enc::set_use_jisx0213(bool use_jisx0213) { - this->use_jisx0213 = use_jisx0213; -} - -bool jis_enc::get_use_jisx0213() { - return this->use_jisx0213; -} diff --git a/src/main.cpp b/src/main.cpp index efb8d9e..11a425b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,135 +1,284 @@ - #include "main.h" +#ifdef DEBUG +#include +#endif + using namespace std; -const string version = string("0.2"); +static const string version = string("1.0"); + +// 512k of buffer +static u32 const DATABUFF_SIZE = 524288; +static u8 const DEFAULT_MATCH_LEN = 10; istream *indata = nullptr; -size_t min_len = 0; -encodings enc = shift_jis_enc; -bool big_endian = true; -bool jisx0213 = false; - -int main(int argc, char **argv) { - process_args(argc, argv); - - if (indata == nullptr) - indata = &cin; - else { - if (!indata->good()) { - cerr << "File could not be opened" << endl; - return 2; - } - indata->seekg(0); - } - - 
jis_enc *encoding = nullptr; - - switch (enc) { - case shift_jis_enc: - encoding = new shift_jis(indata); - break; - default: - cerr << "Encoding not yet supported" << endl; - return 3; - } - - // set up command line values - if (min_len > 0) - encoding->set_min_len(min_len); - encoding->set_is_big_endian(big_endian); - encoding->set_use_jisx0213(jisx0213); - - auto res = encoding->find(); - found_string thisstring; - cout << showbase << internal << setfill('0') << hex; - - for (size_t siter = 0; siter < res->size(); siter++) { - thisstring = res->at(siter); - cout << thisstring.address << " " << &thisstring.data[0] << endl; - } - - if (indata != &cin) - delete indata; - delete encoding; - delete res; - - return 0; +size_t match_len = DEFAULT_MATCH_LEN; +size_t str_cutoff{0}; +string encoding_str = ""; + +static const map enclist{ + {"shift-jis", shift_jis}, {"shiftjis", shift_jis}, {"sjis", shift_jis}, + {"cp932", cp932}, {"windows932", cp932}, {"windows31j", cp932}, + {"euc", eucjp}, {"euc-jp", eucjp}, {"eucjp", eucjp}}; + +int main(int argc, char **argv) +{ + encoding *encoding = nullptr; + vector results; + + try { + // SETUP + process_args(argc, argv); + + if(indata == nullptr) + indata = &cin; + else { + if(!indata->good()) { + throw invalid_argument("File could not be opened"); + } + indata->seekg(0); + } + + if(encoding_str.empty()) + encoding = new encodings::shift_jis(); + else { + if(enclist.find(encoding_str) == enclist.end()) { + throw invalid_argument("Invlaid encoding specified"); + } + + switch(enclist.at(encoding_str)) { + case shift_jis: + encoding = new encodings::shift_jis(); + break; + case eucjp: + encoding = new encodings::euc(); + break; + case cp932: + encoding = new encodings::cp932(); + break; + default: + cerr << "Encoding not yet supported" << endl; + return 3; + } + } + +#ifdef DEBUG + std::chrono::high_resolution_clock::time_point t1 = + std::chrono::high_resolution_clock::now(); +#endif + + // SEARCH + // - read buffer chunk + // - 
pass pointer to chunk offset to is_valid + // - if return value > 0 + // -- add (return value) chars to current string + // -- move buffer point +(return value) + // - if return value <= 0 + // -- are there enough chars in our temp string to count as a found string? + // --- if yes, add string to list + // -- clear temp string + // start over + vector found_strings(); + u8 databuff[DATABUFF_SIZE]; + streamsize bytecount; + u32 work_iter; + // where we are in reading the data chunk buffer + u32 databuff_ptr{0}; + // number of valid bytes returns from the encoding + u8 validcount; + // work string; where we dump valid bytes + found_string workstr; + workstr.data.reserve(match_len); + // the databuff_ptr value when we should read another chunk + u32 buffborder; + // cache this... + u8 enc_max_seqlen = encoding->get_max_seq_len(); + // tracks where we are in the file + u64 stream_ptr{0}; + s16 glyphcount{0}; + u32 this_buffsize = DATABUFF_SIZE; + u32 this_buffoffset{0}; + + while(1) { + if(indata->eof()) + break; + // read a chunk and count how many bytes were actually captured + bytecount = + indata->read((char *)(databuff + this_buffoffset), this_buffsize) + .gcount(); + if(bytecount < 1) + break; + + // cache this too... + buffborder = bytecount - enc_max_seqlen; + + for(databuff_ptr = 0; databuff_ptr < bytecount;) { + // check the databuff pointer + // is it within bytecount - (encoding max_seq_len) ? + // if so, repoint the stream and read another chunk + if(databuff_ptr >= buffborder) { + // step the stream pointer back if needed + // indata->seekg(0 - (bytecount - databuff_ptr), ios::cur); + // well of course we can't seekg on stdin, so let's make things more + // complicated + // we'll copy the remaining bytes to the beginning of the buffer + // then have chunk reader bring in that many less bytes + // what a mess... 
+ std::copy(&databuff[DATABUFF_SIZE - (bytecount - databuff_ptr)], + &databuff[DATABUFF_SIZE], &databuff[0]); + this_buffsize = DATABUFF_SIZE - (bytecount - databuff_ptr); + this_buffoffset = bytecount - databuff_ptr; + + // break out and reread a chunk + break; + } + + validcount = encoding->is_valid(&databuff[databuff_ptr]); + if(validcount > 0) { + // the data is a valid glyph + // add to the work string + if(glyphcount == 0) { + // this is the first character, so store the address where the + // beginning of the string was found + workstr.address = stream_ptr; + } + glyphcount++; + if(str_cutoff > 0 && glyphcount >= str_cutoff) { + databuff_ptr += validcount; + stream_ptr += validcount; + continue; + } + std::copy(&databuff[databuff_ptr], + &databuff[databuff_ptr + validcount], + std::back_inserter(workstr.data)); + databuff_ptr += validcount; + stream_ptr += validcount; + } else { + // data is invalid + // if there are enough characters in the work string, add it to the + // list + if(glyphcount >= match_len) { + workstr.data.push_back('\0'); + results.push_back(workstr); + } + ++databuff_ptr; + ++stream_ptr; + if(glyphcount > 0) { + glyphcount = 0; + workstr.data.clear(); + workstr.data.reserve(match_len); + } + } + } + } + +#ifdef DEBUG + std::chrono::high_resolution_clock::time_point t2 = + std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(t2 - t1).count(); + + cerr << "Search duration: " << duration << endl; + + t1 = std::chrono::high_resolution_clock::now(); +#endif + + // RESULTS + found_string thisstring; + + std::cout << showbase << internal << setfill('0') << hex; + + for(size_t siter = 0; siter < results.size(); siter++) { + thisstring = results.at(siter); + std::cout << thisstring.address << " " << &thisstring.data[0] << endl; + } + + if(indata != &cin) + delete indata; + delete encoding; + // delete results; + + return 0; + } catch(const exception &e) { + cerr << "Fatal error: " << e.what() << endl; + + 
if(indata != &cin) + delete indata; + delete encoding; + // delete results; + + return -1; + } } -void process_args(int argc, char **argv) { - // TODO: add option for double-byte strings bias - // OPTION -d - prefer double-byte strings - - const char *const short_opts = ":hm:e:lx"; - const option long_opts[] = {{"help", no_argument, nullptr, 'h'}, - {"min-length", required_argument, nullptr, 'm'}, - {"encoding", required_argument, nullptr, 'e'}, - {"little-endian", no_argument, nullptr, 'l'}, - {"jisx0213", no_argument, nullptr, 'x'}, - {nullptr, 0, nullptr, 0}}; - - while (true) { - const auto this_opt = - getopt_long(argc, argv, short_opts, long_opts, nullptr); - - if (this_opt == -1) - break; - - switch (this_opt) { - case 'm': - // OPTION -m - set minimum length of string - min_len = strtoul(optarg, nullptr, 10); - break; - case 'e': - // OPTION -e - set encoding - // TODO: move encodings into a map - if (!strcmp(optarg, "shift-jis") || !strcmp(optarg, "sjis")) - enc = shift_jis_enc; - else { - cerr << "Unsupported encoding, defaulting to Shift-JIS" << endl; - } - break; - case 'l': - // OPTION -l - little-endian - big_endian = false; - break; - case 'x': - // OPTION -jisx0213 - Use JIS X 0213 character set - jisx0213 = true; - break; - case 'h': - print_help(); - exit(0); - break; - case ':': - cerr << "Missing argument" << endl; - print_help(); - exit(1); - break; - case '?': - cerr << "Invalid option" << endl; - print_help(); - exit(1); - break; - default: - print_help(); - exit(1); - break; - } - } - - if (optind < argc) { - // only read the first non-option argument, assuming it is input filename - indata = new ifstream(argv[optind]); - } +void process_args(int argc, char **argv) +{ + const char *const short_opts = ":hm:e:lxf"; + const option long_opts[] = {{"help", no_argument, nullptr, 'h'}, + {"match-length", required_argument, nullptr, 'm'}, + {"cutoff", required_argument, nullptr, 'c'}, + {"encoding", required_argument, nullptr, 'e'}, + {nullptr, 0, 
nullptr, 0}}; + + while(true) { + const auto this_opt = + getopt_long(argc, argv, short_opts, long_opts, nullptr); + + if(this_opt == -1) + break; + + switch(this_opt) { + case 'm': + match_len = strtoul(optarg, nullptr, 10); + if(match_len < 1) + throw invalid_argument("Match length must be a positive value"); + break; + case 'c': + str_cutoff = strtoul(optarg, nullptr, 10); + if(str_cutoff < 1) + throw invalid_argument("Max length must be a positive value"); + break; + case 'e': + encoding_str = argv[optind]; + break; + case 'h': + print_help(); + exit(0); + break; + case ':': + cerr << "Missing argument" << endl; + print_help(); + exit(1); + break; + case '?': + cerr << "Invalid option" << endl; + print_help(); + exit(1); + break; + default: + print_help(); + exit(1); + break; + } + } + + if(optind < argc) { + // only read the first non-option argument, assuming it is input filename + indata = new ifstream(argv[optind]); + } } -void print_help() { - cerr << "jstrings version " << version << endl << endl; - cerr << "Valid options:" << endl; - cerr << " --encoding, -e Specify encoding to use" << endl; - cerr << " --jisx0213 Include JIS X 0213 definitions" << endl; - cerr << " --help, -h Display this text" << endl; +void print_help() +{ + cerr << "jstrings version " << version << endl << endl; + cerr << "Valid options:" << endl; + cerr << " --encoding, -e Specify encoding to use" << endl; + cerr << " (Valid options: shiftjis, cp932, eucjp)" << endl; + cerr << " --match-length, -m Specify number of sequential characters " + "required to qualify as a string" + << endl; + cerr << " --cutoff, -c Specify maximum number of characters to " + "display in a single string" + << endl; } diff --git a/src/shift_jis.cpp b/src/shift_jis.cpp deleted file mode 100644 index 0b0fb8d..0000000 --- a/src/shift_jis.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include -#include -#include "jis_enc.h" -#include "shift_jis.h" - -using namespace std; - -// sjis information comes from: -// JWPce 
tables -// the Yamaha RT series page - http://www.rtpro.yamaha.co.jp/RT/docs/misc/kanji-sjis.html -// the Unicode consortium conversion sheet - ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT -// http://www.asahi-net.or.jp/~ax2s-kmtn/ref/index.html - -shift_jis::~shift_jis() {} - -bool shift_jis::is_byte2_valid(const uint8_t* c) { - // sjis byte 2 can never be these values - if((*c >= 0x0) & (*c <= 0x3f) || - (*c == 0x7f) || - (*c >= 0xfd) & (*c <= 0xff)) - return false; - return true; -} - -vector* shift_jis::find() { - - // list of found strings - vector* found_strings = new vector; - - // work bytes - char tempc; - uint8_t thisc, nextc; - - // work string - found_string this_str; - this_str.address = -1; - this_str.data.reserve(this->min_len); - - // choose which double byte filter to use - bool (*jisx_version)(const uint8_t*, const uint8_t*); - if(this->use_jisx0213) jisx_version = shift_jis::is_jisx0213; - else jisx_version = shift_jis::is_jisx0208; - - // stream read loop - while(this->instream->get(tempc)) { - thisc = (uint8_t)tempc; - - // check for JIS X 0201 (single byte) - if(this->is_jisx0201_printable(&thisc)) { - if(this_str.address < 0) this_str.address = this->instream->tellg(); - this_str.data.push_back(thisc); - continue; - } - - // check for JIS X 0208 or 0213 (double byte) - // get next byte - if(!this->instream->get(tempc)) continue; - nextc = (uint8_t)tempc; - - if((this->is_big_endian && jisx_version(&thisc, &nextc)) || jisx_version(&nextc, &thisc)) { - if(this_str.address < 0) this_str.address = this->instream->tellg(); - this_str.data.push_back(thisc); - this_str.data.push_back(nextc); - continue; - } - else { - // push the read pointer back a byte - this->instream->unget(); - } - - // hit an invalid byte - // are there enough character matches to make a string? 
- if(this_str.data.size() >= this->min_len) { - // add terminator to string - this_str.data.push_back('\0'); - found_strings->push_back(this_str); - } - this_str.data.clear(); - this_str.data.reserve(this->min_len); - this_str.address = -1; - - } - //do a final check if we were in the middle of a group of valid bytes - // todo - make this DRY - if(this_str.data.size() >= this->min_len) { - // add terminator to string - this_str.data.push_back('\0'); - found_strings->push_back(this_str); - } - this_str.data.clear(); - this_str.data.reserve(this->min_len); - this_str.address = -1; - - return found_strings; -} - - // JIS X 0201 - 8-bit characters (including 7-bit ASCII) - // excludes non-printable (control code) and reserved bytes - bool shift_jis::is_jisx0201_printable(const uint8_t* c) { - if((*c >= 0x20) & (*c <= 0x7e) || - (*c >= 0xa1) & (*c <= 0xdf)) - return true; - return false; - } - - // JIS X 0208 - 16-bit characters -bool shift_jis::is_jisx0208(const uint8_t* c_h, const uint8_t* c_l) { - - if(!shift_jis::is_byte2_valid(c_l)) - return false; - - // Partial ku (excluding 0x7f) - // 0x81 - 0x40 to 0xac, 0xb8 to 0xbf, 0xc8 to 0xce, 0xda to 0xe8, 0xf0 to 0xf7, 0xfc - // 0x82 - 0x4f to 0x58, 0x60 to 0x79, 0x81 to 0x9a, 0x9f to 0xf1 - // 0x83 - 0x40 to 0x96, 0x9f to 0xb6, 0xbf to 0xd6 - // 0x84 - 0x40 to 0x60, 0x70 to 0x91, 0x9f to 0xbe - // 0x88 - 0x9f to 0xfc - // 0x98 - 0x40 to 0x72, 0x9f to 0xfc - // 0xea - 0x40 to 0xa4 - switch(*c_h) { - case 0x81: - if((*c_l >= 0x40) & (*c_l <= 0xac) || - (*c_l >= 0xb8) & (*c_l <= 0xbf) || - (*c_l >= 0xc8) & (*c_l <= 0xce) || - (*c_l >= 0xda) & (*c_l <= 0xe8) || - (*c_l >= 0xf0) & (*c_l <= 0xf7) || - (*c_l == 0xfc)) - return true; - return false; - case 0x82: - if((*c_l >= 0x4f) & (*c_l <= 0x58) || - (*c_l >= 0x60) & (*c_l <= 0x79) || - (*c_l >= 0x81) & (*c_l <= 0x9a) || - (*c_l >= 0x9f) & (*c_l <= 0xf1)) - return true; - return false; - case 0x83: - if((*c_l >= 0x40) & (*c_l <= 0x96) || - (*c_l >= 0x9f) & (*c_l <= 0xb6) || 
- (*c_l >= 0xbf) & (*c_l <= 0xd6)) - return true; - return false; - case 0x84: - if((*c_l >= 0x40) & (*c_l <= 0x60) || - (*c_l >= 0x70) & (*c_l <= 0x91) || - (*c_l >= 0x9f) & (*c_l <= 0xbe)) - return true; - return false; - case 0x88: - if((*c_l >= 0x9f) & (*c_l <= 0xfc)) - return true; - return false; - case 0x98: - if((*c_l >= 0x40) & (*c_l <= 0x72) || - (*c_l >= 0x9f) & (*c_l <= 0xfc)) - return true; - return false; - case 0xea: - if((*c_l >= 0x40) & (*c_l <= 0xa4)) - return true; - return false; - } - // Full ku (0x40 to 0xfc, excluding 0x7f) - // 0x89 to 0x97, 0x99 to 0xe9 - if((((*c_h >= 0x89) & (*c_h <= 0x97)) || - ((*c_h >= 0x99) & (*c_h <= 0xe9))) && - ((*c_l >= 0x40) & (*c_l <= 0xfc))) - return true; - return false; -} - -// JIS X 0213 - 16-bit characters -bool shift_jis::is_jisx0213(const uint8_t* c_h, const uint8_t* c_l) { - - if(!shift_jis::is_byte2_valid(c_l)) - return false; - - // Partial ku (excluding 0x7f) - // 0x84 - 0x40 to 0xdc, 0xe5 to 0xfa - // 0x86 - 0x40 to 0xf1, 0xfb to 0xfc - // 0x87 - 0x40 to 0x76, 0x7e to 0x8f, 0x93, 0x98 to 0x99, 0x9d to 0xfc - // 0xfc - 0x40 to 0xf4 - switch(*c_h) { - case 0x84: - if((*c_l >= 0x40) & (*c_l <= 0xdc) || - (*c_l >= 0xe5) & (*c_l <= 0xfa)) - return true; - return false; - case 0x86: - if((*c_l >= 0x40) & (*c_l <= 0xf1) || - (*c_l >= 0xfb) & (*c_l <= 0xfc)) - return true; - return false; - case 0x87: - if((*c_l >= 0x40) & (*c_l <= 0x76) || - (*c_l >= 0x7e) & (*c_l <= 0x8f) || - (*c_l == 0x93) || - (*c_l >= 0x98) & (*c_l <= 0x99) || - (*c_l >= 0x9d) & (*c_l <= 0xfc)) - return true; - return false; - case 0xfc: - if((*c_l >= 0x40) & (*c_l <= 0xf4)) - return true; - return false; - } - - // Full ku (0x40 to 0xfc, excluding 0x7f) - // 0x81 to 0x83, 0x85, 0x88 to 0xfb - if((((*c_h >= 0x81) & (*c_h <= 0x83)) || - (*c_h == 0x85) || - ((*c_h >= 0x88) & (*c_h <= 0xfb))) && - ((*c_l >= 0x40) & (*c_l <= 0xfc))) - return true; - return false; -}