|
|
|
|
|
|
|
|
|
from tree_sitter import Language, Parser |
|
import pdb |
|
|
|
import re |
|
from io import StringIO |
|
import tokenize |
|
def remove_comments_and_docstrings(source,lang): |
|
if lang in ['python']: |
|
""" |
|
Returns 'source' minus comments and docstrings. |
|
""" |
|
io_obj = StringIO(source) |
|
out = "" |
|
prev_toktype = tokenize.INDENT |
|
last_lineno = -1 |
|
last_col = 0 |
|
for tok in tokenize.generate_tokens(io_obj.readline): |
|
token_type = tok[0] |
|
token_string = tok[1] |
|
start_line, start_col = tok[2] |
|
end_line, end_col = tok[3] |
|
ltext = tok[4] |
|
if start_line > last_lineno: |
|
last_col = 0 |
|
if start_col > last_col: |
|
out += (" " * (start_col - last_col)) |
|
|
|
if token_type == tokenize.COMMENT: |
|
pass |
|
|
|
elif token_type == tokenize.STRING: |
|
if prev_toktype != tokenize.INDENT: |
|
|
|
if prev_toktype != tokenize.NEWLINE: |
|
if start_col > 0: |
|
out += token_string |
|
else: |
|
out += token_string |
|
prev_toktype = token_type |
|
last_col = end_col |
|
last_lineno = end_line |
|
temp=[] |
|
for x in out.split('\n'): |
|
if x.strip()!="": |
|
temp.append(x) |
|
return '\n'.join(temp) |
|
elif lang in ['ruby']: |
|
return source |
|
else: |
|
def replacer(match): |
|
s = match.group(0) |
|
if s.startswith('/'): |
|
return " " |
|
else: |
|
return s |
|
pattern = re.compile( |
|
r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', |
|
re.DOTALL | re.MULTILINE |
|
) |
|
temp=[] |
|
for x in re.sub(pattern, replacer, source).split('\n'): |
|
if x.strip()!="": |
|
temp.append(x) |
|
return '\n'.join(temp) |
|
|
|
def tree_to_token_index(root_node): |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
return [(root_node.start_point,root_node.end_point)] |
|
else: |
|
code_tokens=[] |
|
for child in root_node.children: |
|
code_tokens+=tree_to_token_index(child) |
|
return code_tokens |
|
|
|
def tree_to_variable_index(root_node,index_to_code): |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
index=(root_node.start_point,root_node.end_point) |
|
_,code=index_to_code[index] |
|
if root_node.type!=code: |
|
return [(root_node.start_point,root_node.end_point)] |
|
else: |
|
return [] |
|
else: |
|
code_tokens=[] |
|
for child in root_node.children: |
|
code_tokens+=tree_to_variable_index(child,index_to_code) |
|
return code_tokens |
|
|
|
def index_to_code_token(index,code): |
|
start_point=index[0] |
|
end_point=index[1] |
|
if start_point[0]==end_point[0]: |
|
s=code[start_point[0]][start_point[1]:end_point[1]] |
|
else: |
|
s="" |
|
s+=code[start_point[0]][start_point[1]:] |
|
for i in range(start_point[0]+1,end_point[0]): |
|
s+=code[i] |
|
s+=code[end_point[0]][:end_point[1]] |
|
return s |
|
|
|
|
|
def DFG_python(root_node,index_to_code,states): |
|
assignment=['assignment','augmented_assignment','for_in_clause'] |
|
if_statement=['if_statement'] |
|
for_statement=['for_statement'] |
|
while_statement=['while_statement'] |
|
do_first_statement=['for_in_clause'] |
|
def_statement=['default_parameter'] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_python(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
if root_node.type=='for_in_clause': |
|
right_nodes=[root_node.children[-1]] |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
else: |
|
if root_node.child_by_field_name('right') is None: |
|
return [],states |
|
left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] |
|
right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] |
|
if len(right_nodes)!=len(left_nodes): |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
if len(left_nodes)==0: |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
if len(right_nodes)==0: |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
DFG=[] |
|
for node in right_nodes: |
|
temp,states=DFG_python(node,index_to_code,states) |
|
DFG+=temp |
|
|
|
for left_node,right_node in zip(left_nodes,right_nodes): |
|
left_tokens_index=tree_to_variable_index(left_node,index_to_code) |
|
right_tokens_index=tree_to_variable_index(right_node,index_to_code) |
|
temp=[] |
|
for token1_index in left_tokens_index: |
|
idx1,code1=index_to_code[token1_index] |
|
temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], |
|
[index_to_code[x][0] for x in right_tokens_index])) |
|
states[code1]=[idx1] |
|
DFG+=temp |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in ['elif_clause','else_clause']: |
|
temp,current_states=DFG_python(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
temp,new_states=DFG_python(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for i in range(2): |
|
right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] |
|
left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] |
|
if len(right_nodes)!=len(left_nodes): |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
if len(left_nodes)==0: |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
if len(right_nodes)==0: |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
for node in right_nodes: |
|
temp,states=DFG_python(node,index_to_code,states) |
|
DFG+=temp |
|
for left_node,right_node in zip(left_nodes,right_nodes): |
|
left_tokens_index=tree_to_variable_index(left_node,index_to_code) |
|
right_tokens_index=tree_to_variable_index(right_node,index_to_code) |
|
temp=[] |
|
for token1_index in left_tokens_index: |
|
idx1,code1=index_to_code[token1_index] |
|
temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], |
|
[index_to_code[x][0] for x in right_tokens_index])) |
|
states[code1]=[idx1] |
|
DFG+=temp |
|
if root_node.children[-1].type=="block": |
|
temp,states=DFG_python(root_node.children[-1],index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_python(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_python(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_python(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
|
|
def DFG_java(root_node,index_to_code,states): |
|
assignment=['assignment_expression'] |
|
def_statement=['variable_declarator'] |
|
increment_statement=['update_expression'] |
|
if_statement=['if_statement','else'] |
|
for_statement=['for_statement'] |
|
enhanced_for_statement=['enhanced_for_statement'] |
|
while_statement=['while_statement'] |
|
do_first_statement=[] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_java(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=root_node.child_by_field_name('left') |
|
right_nodes=root_node.child_by_field_name('right') |
|
DFG=[] |
|
temp,states=DFG_java(right_nodes,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(left_nodes,index_to_code) |
|
value_indexs=tree_to_variable_index(right_nodes,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in increment_statement: |
|
DFG=[] |
|
indexs=tree_to_variable_index(root_node,index_to_code) |
|
for index1 in indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
flag=False |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement and flag is False: |
|
temp,current_states=DFG_java(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
flag=True |
|
temp,new_states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for child in root_node.children: |
|
temp,states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
flag=False |
|
for child in root_node.children: |
|
if flag: |
|
temp,states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
elif child.type=="local_variable_declaration": |
|
flag=True |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in enhanced_for_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
body=root_node.child_by_field_name('body') |
|
DFG=[] |
|
for i in range(2): |
|
temp,states=DFG_java(value,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
temp,states=DFG_java(body,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_java(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
def DFG_csharp(root_node,index_to_code,states): |
|
assignment=['assignment_expression'] |
|
def_statement=['variable_declarator'] |
|
increment_statement=['postfix_unary_expression'] |
|
if_statement=['if_statement','else'] |
|
for_statement=['for_statement'] |
|
enhanced_for_statement=['for_each_statement'] |
|
while_statement=['while_statement'] |
|
do_first_statement=[] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
if len(root_node.children)==2: |
|
name=root_node.children[0] |
|
value=root_node.children[1] |
|
else: |
|
name=root_node.children[0] |
|
value=None |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_csharp(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=root_node.child_by_field_name('left') |
|
right_nodes=root_node.child_by_field_name('right') |
|
DFG=[] |
|
temp,states=DFG_csharp(right_nodes,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(left_nodes,index_to_code) |
|
value_indexs=tree_to_variable_index(right_nodes,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in increment_statement: |
|
DFG=[] |
|
indexs=tree_to_variable_index(root_node,index_to_code) |
|
for index1 in indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
flag=False |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement and flag is False: |
|
temp,current_states=DFG_csharp(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
flag=True |
|
temp,new_states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for child in root_node.children: |
|
temp,states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
flag=False |
|
for child in root_node.children: |
|
if flag: |
|
temp,states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
elif child.type=="local_variable_declaration": |
|
flag=True |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in enhanced_for_statement: |
|
name=root_node.child_by_field_name('left') |
|
value=root_node.child_by_field_name('right') |
|
body=root_node.child_by_field_name('body') |
|
DFG=[] |
|
for i in range(2): |
|
temp,states=DFG_csharp(value,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
temp,states=DFG_csharp(body,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_csharp(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
|
|
|
|
|
|
def DFG_ruby(root_node,index_to_code,states): |
|
assignment=['assignment','operator_assignment'] |
|
if_statement=['if','elsif','else','unless','when'] |
|
for_statement=['for'] |
|
while_statement=['while_modifier','until'] |
|
do_first_statement=[] |
|
def_statement=['keyword_parameter'] |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
states=states.copy() |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_ruby(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=','] |
|
right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=','] |
|
if len(right_nodes)!=len(left_nodes): |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
if len(left_nodes)==0: |
|
left_nodes=[root_node.child_by_field_name('left')] |
|
if len(right_nodes)==0: |
|
right_nodes=[root_node.child_by_field_name('right')] |
|
if root_node.type=="operator_assignment": |
|
left_nodes=[root_node.children[0]] |
|
right_nodes=[root_node.children[-1]] |
|
|
|
DFG=[] |
|
for node in right_nodes: |
|
temp,states=DFG_ruby(node,index_to_code,states) |
|
DFG+=temp |
|
|
|
for left_node,right_node in zip(left_nodes,right_nodes): |
|
left_tokens_index=tree_to_variable_index(left_node,index_to_code) |
|
right_tokens_index=tree_to_variable_index(right_node,index_to_code) |
|
temp=[] |
|
for token1_index in left_tokens_index: |
|
idx1,code1=index_to_code[token1_index] |
|
temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], |
|
[index_to_code[x][0] for x in right_tokens_index])) |
|
states[code1]=[idx1] |
|
DFG+=temp |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement: |
|
temp,current_states=DFG_ruby(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
temp,new_states=DFG_ruby(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for i in range(2): |
|
left_nodes=[root_node.child_by_field_name('pattern')] |
|
right_nodes=[root_node.child_by_field_name('value')] |
|
assert len(right_nodes)==len(left_nodes) |
|
for node in right_nodes: |
|
temp,states=DFG_ruby(node,index_to_code,states) |
|
DFG+=temp |
|
for left_node,right_node in zip(left_nodes,right_nodes): |
|
left_tokens_index=tree_to_variable_index(left_node,index_to_code) |
|
right_tokens_index=tree_to_variable_index(right_node,index_to_code) |
|
temp=[] |
|
for token1_index in left_tokens_index: |
|
idx1,code1=index_to_code[token1_index] |
|
temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index], |
|
[index_to_code[x][0] for x in right_tokens_index])) |
|
states[code1]=[idx1] |
|
DFG+=temp |
|
temp,states=DFG_ruby(root_node.child_by_field_name('body'),index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_ruby(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_ruby(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_ruby(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
def DFG_go(root_node,index_to_code,states): |
|
assignment=['assignment_statement',] |
|
def_statement=['var_spec'] |
|
increment_statement=['inc_statement'] |
|
if_statement=['if_statement','else'] |
|
for_statement=['for_statement'] |
|
enhanced_for_statement=[] |
|
while_statement=[] |
|
do_first_statement=[] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_go(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=root_node.child_by_field_name('left') |
|
right_nodes=root_node.child_by_field_name('right') |
|
DFG=[] |
|
temp,states=DFG_go(right_nodes,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(left_nodes,index_to_code) |
|
value_indexs=tree_to_variable_index(right_nodes,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in increment_statement: |
|
DFG=[] |
|
indexs=tree_to_variable_index(root_node,index_to_code) |
|
for index1 in indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
flag=False |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement and flag is False: |
|
temp,current_states=DFG_go(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
flag=True |
|
temp,new_states=DFG_go(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in states: |
|
if key not in new_states: |
|
new_states[key]=states[key] |
|
else: |
|
new_states[key]+=states[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for child in root_node.children: |
|
temp,states=DFG_go(child,index_to_code,states) |
|
DFG+=temp |
|
flag=False |
|
for child in root_node.children: |
|
if flag: |
|
temp,states=DFG_go(child,index_to_code,states) |
|
DFG+=temp |
|
elif child.type=="for_clause": |
|
if child.child_by_field_name('update') is not None: |
|
temp,states=DFG_go(child.child_by_field_name('update'),index_to_code,states) |
|
DFG+=temp |
|
flag=True |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_go(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_go(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
|
|
|
|
|
|
def DFG_php(root_node,index_to_code,states): |
|
assignment=['assignment_expression','augmented_assignment_expression'] |
|
def_statement=['simple_parameter'] |
|
increment_statement=['update_expression'] |
|
if_statement=['if_statement','else_clause'] |
|
for_statement=['for_statement'] |
|
enhanced_for_statement=['foreach_statement'] |
|
while_statement=['while_statement'] |
|
do_first_statement=[] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('default_value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_php(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=root_node.child_by_field_name('left') |
|
right_nodes=root_node.child_by_field_name('right') |
|
DFG=[] |
|
temp,states=DFG_php(right_nodes,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(left_nodes,index_to_code) |
|
value_indexs=tree_to_variable_index(right_nodes,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in increment_statement: |
|
DFG=[] |
|
indexs=tree_to_variable_index(root_node,index_to_code) |
|
for index1 in indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
flag=False |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement and flag is False: |
|
temp,current_states=DFG_php(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
flag=True |
|
temp,new_states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in states: |
|
if key not in new_states: |
|
new_states[key]=states[key] |
|
else: |
|
new_states[key]+=states[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for child in root_node.children: |
|
temp,states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
flag=False |
|
for child in root_node.children: |
|
if flag: |
|
temp,states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
elif child.type=="assignment_expression": |
|
flag=True |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in enhanced_for_statement: |
|
name=None |
|
value=None |
|
for child in root_node.children: |
|
if child.type=='variable_name' and value is None: |
|
value=child |
|
elif child.type=='variable_name' and name is None: |
|
name=child |
|
break |
|
body=root_node.child_by_field_name('body') |
|
DFG=[] |
|
for i in range(2): |
|
temp,states=DFG_php(value,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
temp,states=DFG_php(body,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_php(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
|
|
def DFG_javascript(root_node,index_to_code,states): |
|
assignment=['assignment_pattern','augmented_assignment_expression'] |
|
def_statement=['variable_declarator'] |
|
increment_statement=['update_expression'] |
|
if_statement=['if_statement','else'] |
|
for_statement=['for_statement'] |
|
enhanced_for_statement=[] |
|
while_statement=['while_statement'] |
|
do_first_statement=[] |
|
states=states.copy() |
|
if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
|
idx,code=index_to_code[(root_node.start_point,root_node.end_point)] |
|
if root_node.type==code: |
|
return [],states |
|
elif code in states: |
|
return [(code,idx,'comesFrom',[code],states[code].copy())],states |
|
else: |
|
if root_node.type=='identifier': |
|
states[code]=[idx] |
|
return [(code,idx,'comesFrom',[],[])],states |
|
elif root_node.type in def_statement: |
|
name=root_node.child_by_field_name('name') |
|
value=root_node.child_by_field_name('value') |
|
DFG=[] |
|
if value is None: |
|
indexs=tree_to_variable_index(name,index_to_code) |
|
for index in indexs: |
|
idx,code=index_to_code[index] |
|
DFG.append((code,idx,'comesFrom',[],[])) |
|
states[code]=[idx] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
name_indexs=tree_to_variable_index(name,index_to_code) |
|
value_indexs=tree_to_variable_index(value,index_to_code) |
|
temp,states=DFG_javascript(value,index_to_code,states) |
|
DFG+=temp |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'comesFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in assignment: |
|
left_nodes=root_node.child_by_field_name('left') |
|
right_nodes=root_node.child_by_field_name('right') |
|
DFG=[] |
|
temp,states=DFG_javascript(right_nodes,index_to_code,states) |
|
DFG+=temp |
|
name_indexs=tree_to_variable_index(left_nodes,index_to_code) |
|
value_indexs=tree_to_variable_index(right_nodes,index_to_code) |
|
for index1 in name_indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in value_indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in increment_statement: |
|
DFG=[] |
|
indexs=tree_to_variable_index(root_node,index_to_code) |
|
for index1 in indexs: |
|
idx1,code1=index_to_code[index1] |
|
for index2 in indexs: |
|
idx2,code2=index_to_code[index2] |
|
DFG.append((code1,idx1,'computedFrom',[code2],[idx2])) |
|
states[code1]=[idx1] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in if_statement: |
|
DFG=[] |
|
current_states=states.copy() |
|
others_states=[] |
|
flag=False |
|
tag=False |
|
if 'else' in root_node.type: |
|
tag=True |
|
for child in root_node.children: |
|
if 'else' in child.type: |
|
tag=True |
|
if child.type not in if_statement and flag is False: |
|
temp,current_states=DFG_javascript(child,index_to_code,current_states) |
|
DFG+=temp |
|
else: |
|
flag=True |
|
temp,new_states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
others_states.append(new_states) |
|
others_states.append(current_states) |
|
if tag is False: |
|
others_states.append(states) |
|
new_states={} |
|
for dic in others_states: |
|
for key in dic: |
|
if key not in new_states: |
|
new_states[key]=dic[key].copy() |
|
else: |
|
new_states[key]+=dic[key] |
|
for key in states: |
|
if key not in new_states: |
|
new_states[key]=states[key] |
|
else: |
|
new_states[key]+=states[key] |
|
for key in new_states: |
|
new_states[key]=sorted(list(set(new_states[key]))) |
|
return sorted(DFG,key=lambda x:x[1]),new_states |
|
elif root_node.type in for_statement: |
|
DFG=[] |
|
for child in root_node.children: |
|
temp,states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
flag=False |
|
for child in root_node.children: |
|
if flag: |
|
temp,states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
elif child.type=="variable_declaration": |
|
flag=True |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
elif root_node.type in while_statement: |
|
DFG=[] |
|
for i in range(2): |
|
for child in root_node.children: |
|
temp,states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
dic={} |
|
for x in DFG: |
|
if (x[0],x[1],x[2]) not in dic: |
|
dic[(x[0],x[1],x[2])]=[x[3],x[4]] |
|
else: |
|
dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3])) |
|
dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4]))) |
|
DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])] |
|
return sorted(DFG,key=lambda x:x[1]),states |
|
else: |
|
DFG=[] |
|
for child in root_node.children: |
|
if child.type in do_first_statement: |
|
temp,states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
for child in root_node.children: |
|
if child.type not in do_first_statement: |
|
temp,states=DFG_javascript(child,index_to_code,states) |
|
DFG+=temp |
|
|
|
return sorted(DFG,key=lambda x:x[1]),states |
|
|
|
dfg_function={ |
|
'python':DFG_python, |
|
'java':DFG_java, |
|
'ruby':DFG_ruby, |
|
'go':DFG_go, |
|
'php':DFG_php, |
|
'javascript':DFG_javascript, |
|
'c_sharp':DFG_csharp, |
|
'c':DFG_csharp, |
|
'cpp':DFG_csharp |
|
} |
|
|
|
def calc_dataflow_match(references, candidate, lang): |
|
return corpus_dataflow_match([references], [candidate], lang) |
|
|
|
def corpus_dataflow_match(references, candidates, lang): |
|
LANGUAGE = Language('parser/my-languages.so', lang) |
|
parser = Parser() |
|
parser.set_language(LANGUAGE) |
|
parser = [parser,dfg_function[lang]] |
|
match_count = 0 |
|
total_count = 0 |
|
|
|
for i in range(len(candidates)): |
|
references_sample = references[i] |
|
candidate = candidates[i] |
|
for reference in references_sample: |
|
try: |
|
candidate=remove_comments_and_docstrings(candidate,'java') |
|
except: |
|
pass |
|
try: |
|
reference=remove_comments_and_docstrings(reference,'java') |
|
except: |
|
pass |
|
|
|
cand_dfg = get_data_flow(candidate, parser) |
|
ref_dfg = get_data_flow(reference, parser) |
|
|
|
normalized_cand_dfg = normalize_dataflow(cand_dfg) |
|
normalized_ref_dfg = normalize_dataflow(ref_dfg) |
|
|
|
if len(normalized_ref_dfg) > 0: |
|
total_count += len(normalized_ref_dfg) |
|
for dataflow in normalized_ref_dfg: |
|
if dataflow in normalized_cand_dfg: |
|
match_count += 1 |
|
normalized_cand_dfg.remove(dataflow) |
|
if total_count == 0: |
|
print("WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. Please consider ignoring this score.") |
|
return 0 |
|
score = match_count / total_count |
|
return score |
|
|
|
def get_data_flow(code, parser): |
|
try: |
|
tree = parser[0].parse(bytes(code,'utf8')) |
|
root_node = tree.root_node |
|
tokens_index=tree_to_token_index(root_node) |
|
code=code.split('\n') |
|
code_tokens=[index_to_code_token(x,code) for x in tokens_index] |
|
index_to_code={} |
|
for idx,(index,code) in enumerate(zip(tokens_index,code_tokens)): |
|
index_to_code[index]=(idx,code) |
|
try: |
|
DFG,_=parser[1](root_node,index_to_code,{}) |
|
except: |
|
DFG=[] |
|
DFG=sorted(DFG,key=lambda x:x[1]) |
|
indexs=set() |
|
for d in DFG: |
|
if len(d[-1])!=0: |
|
indexs.add(d[1]) |
|
for x in d[-1]: |
|
indexs.add(x) |
|
new_DFG=[] |
|
for d in DFG: |
|
if d[1] in indexs: |
|
new_DFG.append(d) |
|
codes=code_tokens |
|
dfg=new_DFG |
|
except: |
|
codes=code.split() |
|
dfg=[] |
|
|
|
dic={} |
|
for d in dfg: |
|
if d[1] not in dic: |
|
dic[d[1]]=d |
|
else: |
|
dic[d[1]]=(d[0],d[1],d[2],list(set(dic[d[1]][3]+d[3])),list(set(dic[d[1]][4]+d[4]))) |
|
DFG=[] |
|
for d in dic: |
|
DFG.append(dic[d]) |
|
dfg=DFG |
|
return dfg |
|
|
|
def normalize_dataflow_item(dataflow_item): |
|
var_name = dataflow_item[0] |
|
var_pos = dataflow_item[1] |
|
relationship = dataflow_item[2] |
|
par_vars_name_list = dataflow_item[3] |
|
par_vars_pos_list = dataflow_item[4] |
|
|
|
var_names = list(set(par_vars_name_list+[var_name])) |
|
norm_names = {} |
|
for i in range(len(var_names)): |
|
norm_names[var_names[i]] = 'var_'+str(i) |
|
|
|
norm_var_name = norm_names[var_name] |
|
relationship = dataflow_item[2] |
|
norm_par_vars_name_list = [norm_names[x] for x in par_vars_name_list] |
|
|
|
return (norm_var_name, relationship, norm_par_vars_name_list) |
|
|
|
def normalize_dataflow(dataflow): |
|
var_dict = {} |
|
i = 0 |
|
normalized_dataflow = [] |
|
for item in dataflow: |
|
var_name = item[0] |
|
relationship = item[2] |
|
par_vars_name_list = item[3] |
|
for name in par_vars_name_list: |
|
if name not in var_dict: |
|
var_dict[name] = 'var_'+str(i) |
|
i += 1 |
|
if var_name not in var_dict: |
|
var_dict[var_name] = 'var_'+str(i) |
|
i+= 1 |
|
normalized_dataflow.append((var_dict[var_name], relationship, [var_dict[x] for x in par_vars_name_list])) |
|
return normalized_dataflow |
|
|
|
|