# %% [markdown]
# # notebook to generate automatic emails to contact all PIK authors with pending publications
# 
# 1. from https://publications.pik-potsdam.de/pubman/faces/HomePage.jsp get curl.txt file of advanced search (manual search query under Identifier: MDB_ID: pending)
# 2. convert curl to python code via https://curlconverter.com/python/
# 3. extract a dictionary `pending_pubs_by_author`, which lists the responsible PIK authors with their pending publications.
# 4. generate an automatic email from the information in `pending_pubs_by_author` for each responsible author.
# 5. Send the emails automatically using a script.  

# %%
import json
from email import policy
from email.message import EmailMessage
from pathlib import Path

import requests

# The Python script below is using the `requests` library to send a `POST` request to the URL `https://publications.pik-potsdam.de/rest/items/search`. 
# The response from the server is stored in the `response` variable. 

# generated with https://curlconverter.com/python/
# based on https://publications.pik-potsdam.de/pubman/faces/SearchResultListPage.jsp

# HTTP headers sent with the search request: the body is JSON and caching is disabled.
headers = {'Cache-Control': 'no-cache', 'Content-Type': 'application/json'}

# Query-string parameters of the search request: ask for the result in JSON format.
params = {'format': 'json'}

def _released_term(field):
    """Return a ``term`` clause requiring ``field`` to have the value 'RELEASED'."""
    return {'term': {field: {'value': 'RELEASED', 'boost': 1.0}}}


def _pending_mdb_id_clause(path):
    """Return a ``nested`` clause on ``path`` for identifiers of type 'MDB_ID' whose id is 'pending'.

    ``path`` is the dotted name of the nested identifier list; the ``.type``
    and ``.id`` sub-fields of that same path are the ones being matched.
    """
    return {
        'nested': {
            'query': {
                'bool': {
                    'must': [
                        {'term': {f'{path}.type': {'value': 'MDB_ID', 'boost': 1.0}}},
                        {'match_phrase': {f'{path}.id': {'query': 'pending', 'slop': 0, 'boost': 1.0}}},
                    ],
                    'adjust_pure_negative': True,
                    'boost': 1.0,
                },
            },
            'path': path,
            'ignore_unmapped': False,
            'score_mode': 'avg',
            'boost': 1.0,
        },
    }


# Search body (generated from the advanced search "Identifier: MDB_ID: pending"):
# items whose publicState and versionState are both RELEASED and that carry a
# 'pending' MDB_ID either on the item itself or on one of its sources.
json_data = {
    'query': {
        'bool': {
            'must': [
                _released_term('publicState'),
                _released_term('versionState'),
                {
                    'bool': {
                        'should': [
                            _pending_mdb_id_clause('metadata.identifiers'),
                            _pending_mdb_id_clause('metadata.sources.identifiers'),
                        ],
                        'adjust_pure_negative': True,
                        'boost': 1.0,
                    },
                },
            ],
            'adjust_pure_negative': True,
            'boost': 1.0,
        },
    },
    # results are returned ordered by title, ascending
    'sort': [{'metadata.title.keyword': {'order': 'ASC'}}],
    # a single page of up to 5000 hits, starting at the first hit
    'size': '5000',
    'from': '0',
}

# Send the search request. `requests` applies no timeout by default, so an
# unresponsive server would block the notebook forever; the (connect, read)
# timeout below makes the call fail instead.
response = requests.post(
    'https://publications.pik-potsdam.de/rest/items/search',
    params=params,
    headers=headers,
    json=json_data,
    timeout=(10, 120),
)
# Raise on 4xx/5xx right here, rather than failing later with a confusing
# JSON decoding error or KeyError when the error body is parsed as a result.
response.raise_for_status()

# Note: json_data will not be serialized by requests
# exactly as it was in the original request.
#data = '{"query" : {\n  "bool" : {\n    "must" : [ {\n      "term" : {\n        "publicState" : {\n          "value" : "RELEASED",\n          "boost" : 1.0\n        }\n      }\n    }, {\n      "term" : {\n        "versionState" : {\n          "value" : "RELEASED",\n          "boost" : 1.0\n        }\n      }\n    }, {\n      "bool" : {\n        "should" : [ {\n          "nested" : {\n            "query" : {\n              "bool" : {\n                "must" : [ {\n                  "term" : {\n                    "metadata.identifiers.type" : {\n                      "value" : "MDB_ID",\n                      "boost" : 1.0\n                    }\n                  }\n                }, {\n                  "match_phrase" : {\n                    "metadata.identifiers.id" : {\n                      "query" : "pending",\n                      "slop" : 0,\n                      "boost" : 1.0\n                    }\n                  }\n                } ],\n                "adjust_pure_negative" : true,\n                "boost" : 1.0\n              }\n            },\n            "path" : "metadata.identifiers",\n            "ignore_unmapped" : false,\n            "score_mode" : "avg",\n            "boost" : 1.0\n          }\n        }, {\n          "nested" : {\n            "query" : {\n              "bool" : {\n                "must" : [ {\n                  "term" : {\n                    "metadata.sources.identifiers.type" : {\n                      "value" : "MDB_ID",\n                      "boost" : 1.0\n                    }\n                  }\n                }, {\n                  "match_phrase" : {\n                    "metadata.sources.identifiers.id" : {\n                      "query" : "pending",\n                      "slop" : 0,\n                      "boost" : 1.0\n                    }\n                  }\n                } ],\n                "adjust_pure_negative" : true,\n                "boost" : 1.0\n              }\n            },\n            "path" : "metadata.sources.identifiers",\n            "ignore_unmapped" : false,\n            "score_mode" : "avg",\n            "boost" : 1.0\n          }\n        } ],\n        "adjust_pure_negative" : true,\n        "boost" : 1.0\n      }\n    } ],\n    "adjust_pure_negative" : true,\n    "boost" : 1.0\n  }\n},"sort" : [{"metadata.title.keyword" : {"order" : "ASC"}}],"size" : "5000","from" : "0"}'
#response = requests.post('https://publications.pik-potsdam.de/rest/items/search', params=params, headers=headers, data=data)

# %% [markdown]
# Extract the necessary information from the JSON response

# %%
# Decode the JSON body of the search response and report how many records it announces.
pending_pubs_original = json.loads(response.content)
print(f"number of Records: {pending_pubs_original['numberOfRecords']}")

# find first authors of pending publications that are part of PIK
def get_author_id(author_id_long):
    """Return the part of ``author_id_long`` that follows its last '/'.

    Helper to shorten a full identifier URI to the bare author id. A string
    without any '/' is returned unchanged; a string ending in '/' yields ''.
    """
    _, _, short_id = author_id_long.rpartition('/')
    return short_id

# Group the pending publications by their first PIK author.
#
# Result: pending_pubs_by_author maps author_id -> {"given_name", "family_name",
# "full_name", "publications": [{"title", "date", "objectId", "url"}, ...]}.

# Metadata date fields, most preferred first; the first one present on a record is used.
DATE_FIELDS = ("datePublishedOnline", "datePublishedInPrint", "dateAccepted")
# Affiliation name that marks a creator as a PIK author.
PIK_ORGANIZATION_NAME = "Potsdam Institute for Climate Impact Research"

pending_pubs_by_author = {}
for record in pending_pubs_original["records"]:
    metadata = record["data"]["metadata"]
    title = metadata["title"]
    objectId = record["data"]["objectId"]

    # add a date if available
    date = next((metadata[field] for field in DATE_FIELDS if field in metadata), None)
    if date is None:
        print("no date found for record", objectId, "with title", title)
        date = "unknown"

    for creator in metadata.get("creators", []):
        person = creator.get("person")
        if person is None:
            # NOTE(review): presumably a creator that is not a person (e.g. an
            # organization); it cannot be contacted as an author, so skip it
            # instead of failing with a KeyError.
            continue

        # check if organizations is a key in creator["person"]
        if "organizations" not in person:
            print("organizations is not a key in creator[\"person\"]")
            continue

        # Check if the creator is part of PIK
        if not any(org.get("name") == PIK_ORGANIZATION_NAME for org in person["organizations"]):
            continue  # Skip this creator if they are not part of PIK

        # The author id (and from it the email address) is derived from the
        # person's identifier; without one the author cannot be contacted, so
        # fall through to the next PIK author of this record.
        author_id_long = (person.get("identifier") or {}).get("id")
        if not author_id_long:
            print("PIK author without identifier skipped for record", objectId, "with title", title)
            continue

        # add the publication to the list of pending publications by the author. If the author is not already in the list, add them.
        author_id = get_author_id(author_id_long)
        given_name = person.get("givenName", "")
        family_name = person.get("familyName", "")
        author_entry = pending_pubs_by_author.setdefault(author_id, {
            "given_name": given_name,
            "family_name": family_name,
            "full_name": f"{given_name} {family_name}".strip(),
            "publications": [],
        })
        author_entry["publications"].append({
            "title": title,
            "date": date,
            "objectId": objectId,
            "url": f"https://publications.pik-potsdam.de/pubman/faces/ViewItemOverviewPage.jsp?itemId={objectId}",
        })
        break  # break after the first PIK author is added, then go to the next record
    else:
        # No `break` happened: the record would otherwise vanish from all statistics unnoticed.
        print("no PIK author with an identifier found for record", objectId, "with title", title)

# sort the publications by date
# (plain string comparison; assumes ISO-like date strings so that lexicographic
# order is chronological -- "unknown" then sorts after all dates; TODO confirm)
for data in pending_pubs_by_author.values():
    data["publications"].sort(key=lambda pub: pub["date"])

# find authors with multiple pending publications for the new structure of pending_pubs_by_author
authors_with_multiple_pubs = {
    author_id: data
    for author_id, data in pending_pubs_by_author.items()
    if len(data["publications"]) > 1
}

# show how many publications are in total pending
print("number of pending publications:", sum(len(data["publications"]) for data in pending_pubs_by_author.values()))

# show how many authors have pending publications
print("number of authors with pending publications:", len(pending_pubs_by_author))

# show authors with multiple pending publications
print("authors with multiple pending publications:", len(authors_with_multiple_pubs))

# List the authors with the most pending publications
print("List of the authors, sorted by the number of pending publications:")
sorted_authors = sorted(pending_pubs_by_author.items(), key=lambda x: len(x[1]["publications"]), reverse=True)
for author_id, data in sorted_authors:
    print(data["given_name"], data["family_name"], ":", len(data["publications"]))

# %%
# Bare expression as the last statement of the cell: shows the parsed search
# response as the cell output for manual inspection.
pending_pubs_original

# %% [markdown]
# ## Generate the emails
# 
# Generate a bunch of emails. One per first PIK Author. These emails should contain text like this.
# 
# > Dear (Author Name),
# >
# > Your Publication(s):
# >
# > 1. Bla bla
# > 2. Bla bla
# >
# > do not have reproducibility data recorded in the publication database.
# > Please contact your scientific coordination team as soon as possible with the required information.
# > best,
# > ...

# %%
def generate_email_address(author_id, domain="pik-potsdam.de"):
    """Return the email address ``<author_id>@<domain>`` for an author.

    Args:
        author_id: Short author id, used as the local part of the address.
        domain: Mail domain. Defaults to the real PIK domain; pass e.g.
            ``"example.com"`` for test runs so no real address is generated.

    Returns:
        The email address as a string.
    """
    return f"{author_id}@{domain}"

def generate_email(author_id, pending_pubs_by_author):
    """Compose the plain-text reminder email for one author.

    Args:
        author_id: Key of the author in ``pending_pubs_by_author``.
        pending_pubs_by_author: Mapping of author id to an entry providing
            ``"full_name"`` and ``"publications"`` (each publication with
            ``"title"``, ``"url"`` and ``"date"``).

    Returns:
        The email body: greeting, numbered publication list, closing text.
    """
    author_entry = pending_pubs_by_author[author_id]
    pubs = author_entry["publications"]
    recipient = author_entry["full_name"]

    greeting = f"Dear {recipient},\n\nYour Publication(s):\n"

    # one numbered entry per publication, numbering starts at 1
    numbered_items = [
        f"{number}. {item['title']}.\n   URL: {item['url']}\n   Date: {item['date']}\n"
        for number, item in enumerate(pubs, start=1)
    ]

    closing = (
        "\ndo not have reproducibility data recorded in the publication database."
        "\nPlease contact your scientific coordination team as soon as possible with the required information."
        "\nBest,\n...\n"
    )

    return greeting + "".join(numbered_items) + closing

def generate_email_file(sender_email, recipient_email, subject, body, author_id, output_dir="PendingPublications"):
    """Write an email as ``<output_dir>/<author_id>.eml``.

    Args:
        sender_email: Value of the ``From`` header.
        recipient_email: Value of the ``To`` header.
        subject: Value of the ``Subject`` header.
        body: Plain-text content of the email.
        author_id: Used as the file name (without extension) of the .eml file.
        output_dir: Directory the file is written to. It is created if it
            does not exist yet, so a fresh checkout does not fail with
            ``FileNotFoundError``.
    """
    # Create the email message
    msg = EmailMessage()
    msg['From'] = sender_email
    msg['To'] = recipient_email
    msg['Subject'] = subject
    msg.set_content(body)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Generate the .eml file; the explicit encoding keeps the result
    # independent of the platform's default text encoding.
    with open(target_dir / f'{author_id}.eml', 'w', encoding='utf-8') as eml_file:
        eml_file.write(msg.as_string(policy=policy.default))

# attach the recipient address and the composed email text to every author entry
for author_id, author_entry in pending_pubs_by_author.items():
    author_entry['email_address'] = generate_email_address(author_id)
    author_entry['email'] = generate_email(author_id, pending_pubs_by_author)

# preview: print the email of the first author only, then stop
for author_id, data in pending_pubs_by_author.items():
    preview = (
        f"Author ID: {author_id}\n"
        f"Name: {data['full_name']}\n"
        f"Email-address: {data['email_address']}\n"
        f"Email:\n\n{data['email']}\n"
    )
    print("------------------------------------------------------------------------------------------------------------------------")
    print(preview)
    break

# %% [markdown]
# # Write the info to an easily accessible webpage
# 
# To preview the webpage locally, use the VSCode extension 'Live Preview' from Microsoft.
# - After installing, Press `Cmd+Shift+P` to open the Command Palette, type "Live Preview: Start Server", and press Enter.
# - After starting Live Preview, your default web browser should automatically open, displaying a live preview of your Markdown file rendered as HTML.
# - Live Preview will automatically refresh the browser view as you make changes to your Markdown file in VS Code, allowing you to see updates in real-time.

# %%
# write one markdown page (plus one .eml file) per author and a home page linking to all author pages
output_dir = Path("PendingPublications")
# create the output directory up front so a fresh checkout does not fail with FileNotFoundError
output_dir.mkdir(parents=True, exist_ok=True)

# horizontal rule written before and after the email text on each author page
separator = "\n---------------------------------------------------------------------------------\n"

author_link_lines = []
for author_id, author in pending_pubs_by_author.items():
    # generate the .eml file and store it in the PendingPublications directory (sender left empty)
    generate_email_file("", author['email_address'], "Missing Reproducibility Data", author['email'], author_id)

    # collect the home-page link to this author's page
    author_link_lines.append(f"* [ {author['full_name']} ]({author_id}.md)\n\n")

    # write the markdown file; explicit UTF-8 so names and titles with non-ASCII
    # characters do not depend on the platform's default encoding
    with open(output_dir / f"{author_id}.md", 'w', encoding="utf-8") as f:
        # title
        f.write(f"# {author['full_name']}\n")
        # author information
        f.write("Author ID: " + author_id + "<br>")
        f.write(f"Email: {author['email_address']}\n\n")
        f.write("### Pending Publications:\n")
        for i, pub in enumerate(author["publications"], start=1):
            f.write(f"{i}. [{pub['title']}]({pub['url']})\n")
        f.write("\n")
        # section for email
        f.write("### Email:\n")
        # link to download the .eml file
        f.write(f'<a href="{author_id}.eml" >Download the .eml file</a>\n\n')
        # email text
        f.write(separator)
        f.write(author['email'])
        f.write(separator)

author_links = "".join(author_link_lines)

# write the website file with the list of authors as links to home.md
with open(output_dir / "home.md", 'w', encoding="utf-8") as f:
    f.write("# PIK authors with missing reproducibility data:\n")
    # Notes from earlier link experiments (previously a disabled `if False:` test section):
    #   - markdown links to .txt and .eml files, e.g. [..](test.txt) / [..](test.eml), did not work
    #   - markdown links to .md files, e.g. [..](test.md), work
    #   - '<a href="test.eml" download>Download the test file</a>' works for all files
    # add the list of authors
    f.write(author_links)

# %%



