diff --git a/P/019_Plagiarism_Checker.ipynb b/P/019_Plagiarism_Checker.ipynb new file mode 100644 index 00000000..9f95d77f --- /dev/null +++ b/P/019_Plagiarism_Checker.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "All the IPython Notebooks in **Python Mini-Projects** series by Dr. Milaan Parmar are available @ **[GitHub](https://github.com/milaan9/91_Python_Mini_Projects)**\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Python Program to Check Plagiarism " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2021-10-05T13:06:11.749110Z", + "start_time": "2021-10-05T13:06:09.467318Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity data:\n", + " ('Ben.txt', 'Clark.txt', 0.408904884400347)\n", + "Similarity data:\n", + " ('Arthur.txt', 'Clark.txt', 0.5430431121089816)\n", + "Similarity data:\n", + " ('Arthur.txt', 'Ben.txt', 0.4595329317649595)\n" + ] + } + ], + "source": [ + "'''\n", + "Python Program to Check Plagiarism \n", + "'''\n", + "\n", + "# Import necessary modules!\n", + "import os\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "student_files = [doc for doc in os.listdir() if doc.endswith('.txt')]\n", + "student_notes = [open(_file, encoding='utf-8').read()\n", + " for _file in student_files]\n", + "\n", + "\n", + "def vectorize(Text): return TfidfVectorizer().fit_transform(Text).toarray()\n", + "def similarity(doc1, doc2): return cosine_similarity([doc1, doc2])\n", + "\n", + "\n", + "vectors = vectorize(student_notes)\n", + "s_vectors = list(zip(student_files, vectors))\n", + "plagiarism_results = set()\n", + "\n", + "\n", + "def check_plagiarism():\n", + " global s_vectors\n", + " for student_a, text_vector_a in s_vectors:\n", + " new_vectors = 
s_vectors.copy()\n", + " current_index = new_vectors.index((student_a, text_vector_a))\n", + " del new_vectors[current_index]\n", + " for student_b, text_vector_b in new_vectors:\n", + " sim_score = similarity(text_vector_a, text_vector_b)[0][1]\n", + " student_pair = sorted((student_a, student_b))\n", + " score = (student_pair[0], student_pair[1], sim_score)\n", + " plagiarism_results.add(score)\n", + " return plagiarism_results\n", + "\n", + "\n", + "for data in check_plagiarism():\n", + " print(\"Similarity data:\\n\", data) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/P/Arthur.txt b/P/Arthur.txt new file mode 
100644 index 00000000..9fa79b09 --- /dev/null +++ b/P/Arthur.txt @@ -0,0 +1 @@ +Success can mean a variety of different things. Success is, quite simply, the accomplishment of a predetermined goal. To some people it could mean making money, cultivate and develop certain basic qualities, to others it could mean keeping everyone happy, but to me, it means achieving the goals and objective I have set for myself for my life. Besides working on your goals that would lead a person towards success it is very important to push your limit every day, take charge of your life, and keep learning. This experience enables us to think smartly to solve a critical problem and achieve success. It is very important to take care of your mind which could be done by eliminating negative thoughts and negative people from your life. I think in order to call something successful, both the result and the process should be great. Without success, you, the group, your company, your goals, dreams and even entire civilizations cease to survive. \ No newline at end of file diff --git a/P/Ben.txt b/P/Ben.txt new file mode 100644 index 00000000..f86f3729 --- /dev/null +++ b/P/Ben.txt @@ -0,0 +1 @@ +In order to be successful one needs cultivate and develop certain basic qualities. Success is, quite simply, the accomplishment of a predetermined goal. We consciously or subconsciously set goals for ourselves all the time. First of all, you must know aim and objective of your life. Unless you know your destination, you cannot set out on a journey. At first, you must by very clear in your objectives to be achieved. Finally, you should enjoy the overall process rather than the final outcome. There is no shortcut to success. Hard work is the only key to achieving it; it teaches us discipline, dedication and determination. There is no single right way to be successful. What works for you might not work for someone else. 
\ No newline at end of file diff --git a/P/Clark.txt b/P/Clark.txt new file mode 100644 index 00000000..f754a3c8 --- /dev/null +++ b/P/Clark.txt @@ -0,0 +1 @@ +Success is, quite simply, the accomplishment of a predetermined goal. Success is considered to be a term that describes two things. It also the combination of variety of different things. The first one is achievement of a certain major or minor goal. This could be succeeding in making a delicious dinner, or a more global thing succeeding in a career or job. The second definition of success is more broad and subjective. Success provides confidence, security, a sense of well-being, the ability to contribute at a greater level, hope and leadership. This experience enables us to think smartly to solve a critical problem and achieve success. Without success, you, the group, your company, your goals, dreams and even entire civilizations cease to survive. \ No newline at end of file diff --git a/P/README.md b/P/README.md new file mode 100644 index 00000000..ca83e675 --- /dev/null +++ b/P/README.md @@ -0,0 +1,96 @@ +

+Last Commit + +

+ + + +# Plagiarism Checker + +In this class, you'll learn how to check similarity between text (.txt) documents using cosine similarity. + +In order to compute the similarity between two text documents, the textual raw data is transformed into vectors ➑ **arrays of numbers**, and then we use basic knowledge of vectors to compute the similarity between them. + + +

+ +

+ +## Prerequisites: + +1. Python Basics +2. sklearn module + +--- + +## Install Necessary Modules: + +Open your [![Anaconda](https://img.shields.io/badge/Anaconda-342B029.svg?&style=flate&logo=anaconda&logoColor=white)](https://www.anaconda.com/products/individual) Prompt, then type and run the following command: + + - pip install scikit-learn + +**[`Sklearn`](https://scikit-learn.org/)** (Scikit-learn) is the most useful and robust library for machine learning in Python. It provides a selection of efficient tools for machine learning and statistical modeling including classification, regression, clustering and dimensionality reduction via a consistent interface in Python. This library, which is largely written in Python, is built upon NumPy, SciPy and Matplotlib. + +Once installed, we can import it in our Python code as `sklearn` (the package is installed under the name `scikit-learn` but imported as `sklearn`). + +--- + +## Frequently asked questions ❔ + +### How can I thank you for writing and sharing this tutorial? 🌷 + +You can Star Badge and Fork Badge. Starring and forking is free for you, but it tells me and other people that it was helpful and you like this tutorial. + +Go [**`here`**](https://github.com/milaan9/91_Python_Mini_Projects) if you aren't here already and click the ➞ **`✰ Star`** and **`ⵖ Fork`** buttons in the top right corner. You will be asked to create a GitHub account if you don't already have one. + +--- + +### How can I read this tutorial without an Internet connection? GIF + +1. Go [**`here`**](https://github.com/milaan9/91_Python_Mini_Projects) and click the big green ➞ **`Code`** button in the top right of the page, then click ➞ [**`Download ZIP`**](https://github.com/milaan9/91_Python_Mini_Projects/archive/refs/heads/main.zip). + + ![Download ZIP](https://github.com/milaan9/91_Python_Mini_Projects/blob/main/img/dnld_rep.png) + +2. Extract the ZIP and open it. Unfortunately I don't have any more specific instructions because how exactly this is done depends on which operating system you run. + +3.
Launch ipython notebook from the folder which contains the notebooks. Open each one of them and select: + + `Kernel > Restart & Clear Output` + +This will clear all the outputs and now you can understand each statement and learn interactively. + +If you have git and you know how to use it, you can also clone the repository instead of downloading a zip and extracting it. An advantage of doing it this way is that you don't need to download the whole tutorial again to get the latest version of it; all you need to do is pull with git and run ipython notebook again. + +--- + +## Authors ✍️ + +I'm Dr. Milaan Parmar and I have written this tutorial. If you think you can add/correct/edit and enhance this tutorial you are most welcome 🙏 + +See [GitHub's contributors page](https://github.com/milaan9/91_Python_Mini_Projects/graphs/contributors) for details. + +If you have trouble with this tutorial please tell me about it by [creating an issue on GitHub](https://github.com/milaan9/91_Python_Mini_Projects/issues/new) and I'll make this tutorial better. This is probably the best choice if you had trouble following the tutorial, and something in it should be explained better. You will be asked to create a GitHub account if you don't already have one. + +If you like this tutorial, please [give it a ⭐ star](https://github.com/milaan9/91_Python_Mini_Projects). + +--- + +## Licence 📜 + +You may use this tutorial freely at your own risk. See [LICENSE](https://github.com/milaan9/91_Python_Mini_Projects/blob/main/LICENSE). + +Copyright (c) 2020 Dr. Milaan Parmar + +--- + +
+

Connect with me +

+

+ LinkedIn + Instagram + Facebook + Gmail +

+ + diff --git a/P/output.png b/P/output.png new file mode 100644 index 00000000..e417edb8 Binary files /dev/null and b/P/output.png differ