# Authors-Graph/main.py at main · Prolab-Project/Authors-Graph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
from flask import Flask, jsonify, request, render_template
import pandas as pd
import sys
import json

def parse_coauthors(coauthor_str):
    """Parse a stringified list of co-author names into normalized names.

    The input is expected to look like ``"['Name One', 'Name Two']"``.  The
    surrounding brackets are removed, the text is split on commas, and each
    piece is trimmed of whitespace and quotes and lower-cased.

    Args:
        coauthor_str: The raw cell value, or a missing value (NaN/None).

    Returns:
        A list of lower-cased names.  Missing values, ``"[]"`` and empty
        pieces (e.g. from a trailing comma) produce no entries.
    """
    if pd.isna(coauthor_str):
        return []
    pieces = coauthor_str.strip("[]").split(",")
    names = (piece.strip().strip("'").strip('"').lower() for piece in pieces)
    # "".split(",") yields [""], so drop empty names instead of reporting a
    # nameless co-author.
    return [name for name in names if name]

def clean_connections(graph, data):
    """Drop connections that point at a node carrying the author's own name.

    For every ORCID in ``data`` the set of (stripped, lower-cased) author
    names recorded for it is collected.  Then, for each graph node whose id
    does not start with ``"generated"``, any connection whose target node
    name is in that set is removed from the node's ``connections`` list.

    Only the node's own ``connections`` list is rewritten; the other node's
    list and ``graph.edges`` are left as they are.

    Args:
        graph: Object exposing a ``nodes`` dict of
            ``id -> {"name": ..., "connections": [...]}``.
        data: DataFrame with ``orcid`` and ``author_name`` columns.
    """
    orcid_to_names = {}
    for raw_orcid, raw_name in zip(data["orcid"], data["author_name"]):
        # Rows without an ORCID or a name cannot contribute a mapping; calling
        # .lower() on the missing value would raise instead.
        if pd.isna(raw_orcid) or pd.isna(raw_name):
            continue
        names = orcid_to_names.setdefault(raw_orcid.lower(), set())
        names.add(raw_name.strip().lower())

    for orcid, node_data in graph.nodes.items():
        if orcid.startswith("generated"):
            continue
        own_names = orcid_to_names.get(orcid, set())
        node_data["connections"] = [
            conn for conn in node_data["connections"]
            if graph.nodes[conn]["name"] not in own_names
        ]


class Graph:
    """Undirected, weighted graph of authors keyed by author id.

    Attributes:
        nodes: Maps an author id to a dict with the keys ``"name"``,
            ``"connections"`` (list of neighbour ids) and ``"papers"``
            (list of paper titles).
        edges: Maps a ``(smaller_id, larger_id)`` tuple to the accumulated
            weight of that edge.
    """

    def __init__(self):
        self.nodes = {}
        self.edges = {}

    def addNode(self, orcid, author_name):
        """Register ``orcid`` with ``author_name``; existing ids are kept as is."""
        if orcid not in self.nodes:
            self.nodes[orcid] = {
                "name": author_name,
                "connections": [],
                "papers": []
            }

    def addPaper(self, orcid, paper_title):
        """Append ``paper_title`` to a known node, skipping duplicates."""
        if orcid in self.nodes and paper_title not in self.nodes[orcid]["papers"]:
            self.nodes[orcid]["papers"].append(paper_title)

    def addEdges(self, orcid_1, orcid_2, weight=1):
        """Add ``weight`` to the edge between two known, distinct nodes.

        Nothing happens when the ids are equal, when either id is unknown, or
        when both nodes carry the same name.  The first time an edge is seen
        each endpoint is appended to the other's ``connections`` list; later
        calls only increase the stored weight.
        """
        if orcid_1 == orcid_2:
            return
        if orcid_1 not in self.nodes or orcid_2 not in self.nodes:
            return
        if self.nodes[orcid_1]["name"] == self.nodes[orcid_2]["name"]:
            return

        # Sort the endpoints so (a, b) and (b, a) share one dictionary key.
        edge = (min(orcid_1, orcid_2), max(orcid_1, orcid_2))
        if edge in self.edges:
            self.edges[edge] += weight
        else:
            self.edges[edge] = weight
            self.nodes[orcid_1]["connections"].append(orcid_2)
            self.nodes[orcid_2]["connections"].append(orcid_1)

    def writeJsonManual(self, output_file="graph_output.json"):
        """Write the graph to ``output_file`` as UTF-8 JSON.

        The document has a ``"nodes"`` list (id, name, names of connected
        nodes, and ``"papers"`` for ids that do not start with
        ``"generated"``) and an ``"edges"`` list of ``{"edge", "weight"}``
        entries.
        """
        graph_data = {
            "nodes": [],
            "edges": []
        }

        for node_id, node_data in self.nodes.items():
            node_entry = {
                "orcid": node_id,
                "name": node_data["name"],
                "connections": [self.nodes[conn]["name"] for conn in node_data["connections"]]
            }
            if not node_id.startswith("generated"):
                node_entry["papers"] = node_data["papers"]
            graph_data["nodes"].append(node_entry)

        for edge, weight in self.edges.items():
            graph_data["edges"].append({
                "edge": list(edge),
                "weight": weight
            })

        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(graph_data, file, ensure_ascii=False, indent=4)

        print(f"Graph written to JSON file: {output_file}")

    def getNodes(self):
        """Return the ``nodes`` dictionary itself (not a copy)."""
        return self.nodes

    def get_outgoing_edges(self, node):
        """Return the neighbour ids of ``node``, or ``[]`` for an unknown id."""
        if node in self.nodes:
            return self.nodes[node]["connections"]
        return []

    def value(self, from_node, to_node):
        """Return the weight of the edge between two ids, or ``inf`` if absent."""
        edge = (min(from_node, to_node), max(from_node, to_node))
        return self.edges.get(edge, float('inf'))


# These three accessors used to be defined at module level; the names are kept
# so any code calling them as plain functions with a graph argument still works.
getNodes = Graph.getNodes
get_outgoing_edges = Graph.get_outgoing_edges
value = Graph.value

def dijkstra(Graph, start_node):
    """Compute single-source shortest paths from ``start_node``.

    Args:
        Graph: A graph object providing ``getNodes()``,
            ``get_outgoing_edges(node)`` and ``value(a, b)``; the latter is
            used as the cost of stepping from ``a`` to ``b``.
        start_node: Id of the node distances are measured from.

    Returns:
        A ``(previous_nodes, shortest_path)`` tuple.  ``previous_nodes`` maps
        a node to its predecessor on the best path found; ``shortest_path``
        maps every node to its distance, with ``sys.maxsize`` for nodes that
        were never improved.
    """
    pending = list(Graph.getNodes().keys())
    unreached = sys.maxsize

    shortest_path = {node: unreached for node in pending}
    shortest_path[start_node] = 0
    previous_nodes = {}

    while pending:
        # min() keeps the first node among equal distances, which is the
        # tie-break the search relies on.
        closest = min(pending, key=shortest_path.__getitem__)

        for neighbor in Graph.get_outgoing_edges(closest):
            candidate = shortest_path[closest] + Graph.value(closest, neighbor)
            if candidate < shortest_path[neighbor]:
                shortest_path[neighbor] = candidate
                previous_nodes[neighbor] = closest

        pending.remove(closest)

    return previous_nodes, shortest_path

def create_priority_queue_manual(graph, start_id):
    """Build a queue of an author and their direct collaborators.

    Each entry is a ``(connection_count, author_id)`` tuple, where the count
    is the length of that author's ``connections`` list.  The queue is
    ordered by hand (no library sort) so that larger counts come first.

    Args:
        graph: Graph object providing ``getNodes()`` and
            ``get_outgoing_edges(node)``.
        start_id: Id of the author the queue is built around.

    Returns:
        The ordered list of entries, or ``[]`` (after printing a message)
        when ``start_id`` is not in the graph.
    """
    if start_id not in graph.getNodes():
        print(f"No such ORCID {start_id} exists in the graph.")
        return []

    def connection_count(author_id):
        return len(graph.getNodes()[author_id]["connections"])

    # The starting author goes in first.
    queue = [(connection_count(start_id), start_id)]
    seen = {start_id}

    # Then every direct collaborator, once each.
    for neighbor in graph.get_outgoing_edges(start_id):
        if neighbor in seen:
            continue
        seen.add(neighbor)
        queue.append((connection_count(neighbor), neighbor))

    # Hand-written exchange sort, descending by count.
    size = len(queue)
    for left in range(size - 1):
        for right in range(left + 1, size):
            if queue[right][0] > queue[left][0]:
                queue[left], queue[right] = queue[right], queue[left]

    return queue

def print_priority_queue_manual(priority_queue, graph):
    """Print the queue: a header line, then one line per entry.

    Args:
        priority_queue: Iterable of ``(weight, author_id)`` tuples.
        graph: Graph object whose ``getNodes()`` maps each id to a dict
            with a ``"name"`` key.
    """
    print("\nPriority Queue (Yazarlar ve Ağırlıkları):")
    for entry in priority_queue:
        weight, author_id = entry
        author_name = graph.getNodes()[author_id]["name"]
        line = f"Yazar: {author_name} (ORCID: {author_id}), İşbirliği Sayısı: {weight}"
        print(line)


def find_shortest_path(graph, start_id, end_id):
    """Return the shortest path between two nodes and its total distance.

    Runs ``dijkstra`` from ``start_id`` and walks the predecessor map back
    from ``end_id``.

    Returns:
        ``(path, distance)`` where ``path`` lists node ids from ``start_id``
        to ``end_id``; ``(None, inf)`` when the walk back hits a node with no
        recorded predecessor before reaching ``start_id``.
    """
    previous_nodes, shortest_path = dijkstra(graph, start_id)

    # Collected end -> start, then reversed on return.
    trail = [end_id]
    while trail[-1] != start_id:
        parent = previous_nodes.get(trail[-1])
        if parent is None:
            return None, float('inf')
        trail.append(parent)

    return trail[::-1], shortest_path[end_id]

def find_max_connection(graph):
    """Find the node with the most entries in its ``connections`` list.

    Returns:
        ``(author_id, connection_count)`` for the first node reaching the
        highest count, or ``(None, 0)`` when no node has any connection.
    """
    nodes = graph.getNodes()

    def degree(author_id):
        return len(nodes[author_id]["connections"])

    # max() returns the first id among equal counts.
    busiest = max(nodes, key=degree, default=None)
    if busiest is None or degree(busiest) == 0:
        return None, 0
    return busiest, degree(busiest)

def find_connection_count(graph, countId):
    """Return how many connections the node ``countId`` has.

    Args:
        graph: Graph object whose ``getNodes()`` returns the id -> node dict.
        countId: Id of the node to look up.

    Returns:
        The length of the node's ``connections`` list, or ``None`` when the
        id is not in the graph.
    """
    # Direct dictionary lookup instead of scanning every node for the key.
    node_data = graph.getNodes().get(countId)
    if node_data is None:
        return None
    return len(node_data["connections"])

def find_longest_path(graph, start_node):
    """Return the longest simple path starting at ``start_node``.

    Performs a backtracking depth-first search that never revisits a node
    already on the current path, and keeps the first path found with the
    greatest number of nodes.  Every simple path from ``start_node`` is
    explored, so run time grows quickly on densely connected graphs.

    The search uses an explicit stack rather than recursion so long paths do
    not hit the interpreter's recursion limit.

    Args:
        graph: Graph object providing ``get_outgoing_edges(node)``.
        start_node: Id of the node the path starts from.

    Returns:
        The list of node ids on the longest path, beginning with
        ``start_node``.
    """
    path = [start_node]
    on_path = {start_node}
    longest_path = path[:]
    # One neighbour iterator per node on the current path.
    pending = [iter(graph.get_outgoing_edges(start_node))]

    while pending:
        for neighbor in pending[-1]:
            if neighbor in on_path:
                continue
            on_path.add(neighbor)
            path.append(neighbor)
            if len(path) > len(longest_path):
                longest_path = path[:]
            pending.append(iter(graph.get_outgoing_edges(neighbor)))
            break
        else:
            # All neighbours of the deepest node are exhausted: backtrack.
            pending.pop()
            on_path.remove(path.pop())

    return longest_path
# Load the dataset; the columns read below are "orcid", "author_name",
# "paper_title" and "coauthors".
file_path = 'data/dataset.xlsx'
data = pd.read_excel(file_path)


# Paper titles per lower-cased ORCID, in first-seen order and without repeats.
author_papers = {}
for _, row in data.iterrows():
    if pd.isna(row["orcid"]) or pd.isna(row["paper_title"]):
        continue
    orcid = row["orcid"].lower()
    titles = author_papers.setdefault(orcid, [])
    if row["paper_title"] not in titles:
        titles.append(row["paper_title"])

# Lower-cased ORCID -> lower-cased author name, from complete rows only.
# NOTE(review): names are lower-cased but not stripped here, while
# parse_coauthors strips whitespace — confirm the dataset has no padded names.
unique_authors = data[["author_name", "orcid", "paper_title"]].dropna().drop_duplicates()
author_id_map = {
    known_orcid.lower(): known_name.lower()
    for known_orcid, known_name in zip(unique_authors["orcid"], unique_authors["author_name"])
}

# Every co-author name mentioned anywhere in the dataset.
all_coauthors = set().union(*data["coauthors"].apply(parse_coauthors))

# Co-authors that do not appear as an author with an ORCID of their own.
existing_authors = set(author_id_map.values())
missing_coauthors = all_coauthors.difference(existing_authors)

def generate_deterministic_id(author_name):
    """Derive a repeatable ``generated-<n>`` id from an author name.

    ``n`` is the sum of each character's code point multiplied by its
    1-based position, reduced modulo 1,000,000.  The same name always gives
    the same id, but different names can share one (for example ``"ab"`` and
    ``"ca"`` both give ``generated-293``).
    """
    weighted_sum = sum(
        position * ord(char)
        for position, char in enumerate(author_name, start=1)
    )
    return f"generated-{weighted_sum % 1000000}"

# Give every co-author without an ORCID a generated id.  Names are processed
# in sorted order so the result does not depend on set iteration order, and a
# numeric suffix is added when two names produce the same generated id so the
# later name does not overwrite the earlier one.
for coauthor in sorted(missing_coauthors):
    base_id = generate_deterministic_id(coauthor)
    deterministic_id = base_id
    collision_index = 1
    while deterministic_id in author_id_map:
        collision_index += 1
        deterministic_id = f"{base_id}-{collision_index}"
    author_id_map[deterministic_id] = coauthor

# One node per known id, with the papers recorded for that id (if any).
authorGraph = Graph()
for orcid, author_name in author_id_map.items():
    authorGraph.addNode(orcid, author_name)
    for paper in author_papers.get(orcid, []):
        authorGraph.addPaper(orcid, paper)

data["author_orcid"] = data["orcid"].str.lower()
data["coauthors"] = data["coauthors"].apply(parse_coauthors)

# Name -> id index built once; when several ids share a name the first id in
# author_id_map wins, matching a front-to-back search of the map.
name_to_id = {}
for known_id, known_name in author_id_map.items():
    name_to_id.setdefault(known_name, known_id)

# Connect each row's author to every co-author that has a node.
for author_orcid, coauthors in zip(data["author_orcid"], data["coauthors"]):
    for coauthor in coauthors:
        coauthor_orcid = name_to_id.get(coauthor)
        if coauthor_orcid:
            authorGraph.addEdges(author_orcid, coauthor_orcid)

clean_connections(authorGraph, data)

authorGraph.writeJsonManual("cleaned_graph_output.json")
print("Bağlantılardan yazarın kendi ismiyle eşleşenler temizlendi ve güncellenmiş JSON dosyasına yazıldı: cleaned_graph_output.json")


# NOTE(review): this second file is written after clean_connections too, so
# both files hold the same content — confirm whether graph_output.json was
# meant to contain the graph before cleaning.
authorGraph.writeJsonManual("graph_output.json")
print("Graf JSON dosyasına yazdırıldı: graph_output.json")

# Node ids are lower-cased when the graph is built, so typed ids are
# normalised the same way before they are looked up.
start_orcid = input("Enter the start ORCID: ").strip().lower()
end_orcid = input("Enter the end ORCID: ").strip().lower()

path, distance = find_shortest_path(authorGraph, start_orcid, end_orcid)
if path is None:
    print(f"\n\nThere is no path between {start_orcid} and {end_orcid}")
else:
    print(f"\n\nShortest path: {path}")
    print(f"\nTotal distance: {distance}")

most_connected_author, max_connections = find_max_connection(authorGraph)
if most_connected_author is None:
    # find_max_connection returns (None, 0) when no node has a connection;
    # indexing the node dict with None would raise KeyError.
    print("No author in the graph has any connections.")
else:
    author_name = authorGraph.getNodes()[most_connected_author]["name"]
    print(f"Most connected author: {author_name} (ORCID: {most_connected_author})")
    print(f"Number of connections: {max_connections}")

countId = input("Type the ID for which you want to calculate the number of connections :").strip().lower()
count_connection = find_connection_count(authorGraph, countId)
if count_connection is None:
    print("Count id has no connections")
else:
    print(f"Count id number of connections is : {count_connection}")

start_id = input("Enter the ORCID to find the longest path: ").strip().lower()
if start_id in authorGraph.getNodes():
    longest_path = find_longest_path(authorGraph, start_id)
    print(f"\nLongest path from {start_id}: {longest_path}")
    print(f"Number of nodes in the longest path: {len(longest_path)}")
else:
    print(f"No such ORCID {start_id} exists in the graph.")


# Ask once for an author and print the queue of that author and their direct
# collaborators.  Node ids are stored lower-cased, so the typed id is
# normalised before the lookup.
author_id = input("dugum olusturmak icin ORCID id giriniz: ").strip().lower()
if author_id in authorGraph.getNodes():
    priority_queue = create_priority_queue_manual(authorGraph, author_id)
    print_priority_queue_manual(priority_queue, authorGraph)
else:
    print(f"No such ORCID {author_id} exists in the graph.")