diff --git a/A_John.txt b/A_John.txt new file mode 100644 index 0000000..dd813da --- /dev/null +++ b/A_John.txt @@ -0,0 +1,2 @@ +Believe in yourself and all that you are. +Know that there is something inside you that is greater than any obstacle. \ No newline at end of file diff --git a/README.md b/README.md index 3076250..f553470 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,6 @@ $ Plagiarism-checker-Python-> python3 app.py ('fatma.txt', 'juma.txt', 0.18643448370323362) ``` - ## A Python Library? Would you like to use a Python library instead to help you compare strings and documents without spending time writing the vectorizers by yourself, then take a look at [Pysimilar](https://github.com/Kalebu/pysimilar). @@ -57,7 +56,7 @@ you can raise an issue. ## Pull Requests -If you have something to add, I welcome pull requests on improvement; your helpful contribution will be merged as soon as possible. +If you have something to add, I welcome pull requests on improvement; your helpful contribution will be merged as soon as possible. ## Give it a Star diff --git a/app.py b/app.py index 7a6f452..6f51cb2 100644 --- a/app.py +++ b/app.py @@ -15,8 +15,9 @@ def similarity(doc1, doc2): return cosine_similarity([doc1, doc2]) s_vectors = list(zip(student_files, vectors)) plagiarism_results = set() +# threshold for the similarity score -def check_plagiarism(): +def check_plagiarism(threshold=0.8): global s_vectors for student_a, text_vector_a in s_vectors: new_vectors = s_vectors.copy() @@ -24,11 +25,11 @@ def check_plagiarism(): del new_vectors[current_index] for student_b, text_vector_b in new_vectors: sim_score = similarity(text_vector_a, text_vector_b)[0][1] - student_pair = sorted((student_a, student_b)) - score = (student_pair[0], student_pair[1], sim_score) - plagiarism_results.add(score) + if sim_score > threshold: + student_pair = sorted((student_a, student_b)) + score = (student_pair[0], student_pair[1], sim_score) + plagiarism_results.add(score) return plagiarism_results - for data in check_plagiarism(): print(data)