# For web_1442, the page ids in the raw data are not contiguous:
# there are 332 pages in total, but the largest id in the raw data is 335.
# We use a dictionary to remap the raw page ids to a dense index starting at 0.

# Convert raw_1442.txt (one comma-separated link record per line) into:
#   ../web_data/web_1442.txt : "<src>,<dst>,<n|s|a>" per link, using remapped page ids
#   web_1442_domain.txt      : "<page id>\t<domain id>" per remapped page id
#   web_1442_action.txt      : "<src>\t<dst>" for links whose anchor field contains
#                              one of ACTION_KEYWORDS

# Substrings looked for in the lower-cased anchor field to flag a link as an action link.
ACTION_KEYWORDS = ('e-mail', 'edit', 'share', 'vote')

# Maps a raw page id to its new index; indices are handed out in order of first appearance.
page_index = {}
page_cnt = 0

# Maps a remapped page id to the domain id seen with it (the last occurrence wins).
id2dom = {}

# The output files are opened before the input, as in the original script.
# The `with` statement guarantees all four handles are flushed and closed
# even if a malformed record raises part-way through.
with open('../web_data/web_1442.txt', 'w') as f_o, \
        open('web_1442_domain.txt', 'w') as f_d, \
        open('web_1442_action.txt', 'w') as f_a, \
        open('raw_1442.txt', 'r') as f_i:
    for line in f_i:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing on them.
        if not line.strip():
            continue

        term = line.split(',')

        # Fields are addressed from the END of the record; presumably earlier
        # fields may themselves contain commas -- verify against the raw format.
        # Strip the line terminator so the last record of a file without a
        # final newline (or with a stray CR) is still classified correctly.
        link_type = term[-1].rstrip('\r\n')

        # Id fields are wrapped in one character on each side (quotes in the
        # raw data, judging by the link-type literals); [1:-1] removes them.
        src_url_id = int(term[-7][1:-1])
        dst_url_id = int(term[-4][1:-1])
        src_dom_id = int(term[-5][1:-1])
        dst_dom_id = int(term[-2][1:-1])

        # Assign a new index to each page id the first time it is seen
        # (source before destination, matching the original numbering).
        for raw_id in (src_url_id, dst_url_id):
            if raw_id not in page_index:
                page_index[raw_id] = page_cnt
                page_cnt += 1
        src_url_id = page_index[src_url_id]
        dst_url_id = page_index[dst_url_id]

        # Any link type other than "n" or "s" is written out as 'a'.
        if link_type == '"n"':
            link_code = 'n'
        elif link_type == '"s"':
            link_code = 's'
        else:
            link_code = 'a'
        f_o.write('{},{},{}\n'.format(src_url_id, dst_url_id, link_code))

        id2dom[src_url_id] = src_dom_id
        id2dom[dst_url_id] = dst_dom_id

        # NOTE(review): term[3] is the 4th comma-separated piece; if the anchor
        # text itself contains commas, only its first piece is inspected here.
        anchor = term[3].lower()

        # Echo the anchor field of explicit "a" links to stdout.
        if link_type == '"a"':
            print(anchor)

        if any(keyword in anchor for keyword in ACTION_KEYWORDS):
            f_a.write('{}\t{}\n'.format(src_url_id, dst_url_id))

    # The domain table is written once all records have been processed.
    for u, d in id2dom.items():
        f_d.write('{}\t{}\n'.format(u, d))
