|
|
|
|
|
|
|
import re |
|
from io import StringIO |
|
import tokenize |
|
def remove_comments_and_docstrings(source, lang):
    """Return ``source`` with comments (and, for Python, docstrings) removed.

    Args:
        source: Program text to clean.
        lang: Language name. ``'python'`` is processed with the ``tokenize``
            module, ``'ruby'`` is returned untouched, and every other value
            is treated as using C-style ``//`` and ``/* ... */`` comments.

    Returns:
        The cleaned text with whitespace-only lines dropped. ``'ruby'``
        input is returned exactly as given.

    Raises:
        Nothing is caught here: whatever ``tokenize.generate_tokens`` raises
        for Python input it cannot tokenize propagates to the caller.
    """
    if lang == 'python':
        return _drop_blank_lines(_strip_python_comments(source))
    if lang == 'ruby':
        return source
    stripped = _C_COMMENT_OR_STRING.sub(_blank_out_comment, source)
    return _drop_blank_lines(stripped)


# Matches, in order: a ``//`` line comment, a ``/* */`` block comment, a
# single-quoted literal, a double-quoted literal.  Literals are matched only
# so that comment markers inside them are skipped over, not stripped.
_C_COMMENT_OR_STRING = re.compile(
    r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
    re.DOTALL | re.MULTILINE
)

# Token types that carry layout only and therefore never start a statement.
_LAYOUT_TOKENS = (
    tokenize.NL,
    tokenize.COMMENT,
    tokenize.INDENT,
    tokenize.DEDENT,
)


def _drop_blank_lines(text):
    """Return ``text`` without its empty or whitespace-only lines."""
    return '\n'.join(line for line in text.split('\n') if line.strip())


def _blank_out_comment(match):
    """Replace a matched comment with one space; keep string literals."""
    matched = match.group(0)
    # A space (not an empty string) so the tokens on either side of the
    # comment do not get glued together.
    return " " if matched.startswith('/') else matched


def _strip_python_comments(source):
    """Rebuild Python ``source`` from its tokens, omitting comments/docstrings."""
    pieces = []
    # Starting as INDENT makes a string that is the very first token be
    # treated like a docstring (i.e. dropped).
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    # True once the current logical line has produced a real token.
    mid_statement = False
    for tok in tokenize.generate_tokens(StringIO(source).readline):
        token_type, token_string = tok[0], tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            # Re-create the horizontal gap between consecutive tokens.
            pieces.append(" " * (start_col - last_col))
        if token_type == tokenize.COMMENT:
            pass
        elif token_type == tokenize.STRING:
            # A string directly after INDENT or NEWLINE is a docstring or a
            # bare string statement.  A column-0 string is treated the same
            # way unless it continues a statement already in progress
            # (inside brackets, after a backslash, implicit concatenation),
            # in which case it is real data and must be kept.
            follows_code = prev_toktype not in (tokenize.INDENT, tokenize.NEWLINE)
            if follows_code and (start_col > 0 or mid_statement):
                pieces.append(token_string)
        else:
            pieces.append(token_string)
        if token_type == tokenize.NEWLINE:
            mid_statement = False
        elif token_type not in _LAYOUT_TOKENS:
            mid_statement = True
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return "".join(pieces)
|
|
|
def tree_to_token_index(root_node):
    """Collect the source spans of every token under ``root_node``.

    A node counts as a token when it has no children, or when its type is
    ``'string_literal'``, ``'string'`` or ``'character_literal'`` (such nodes
    are taken whole, without descending into them).  Nodes of type
    ``'comment'`` never count as tokens.

    Args:
        root_node: Tree node exposing ``children``, ``type``,
            ``start_point`` and ``end_point`` (presumably a tree-sitter
            node -- confirm against the caller).

    Returns:
        A list of ``(start_point, end_point)`` pairs in left-to-right,
        depth-first order.
    """
    spans = []
    # Explicit stack instead of recursion so that very deep trees cannot
    # exhaust the interpreter's recursion limit.
    pending = [root_node]
    while pending:
        node = pending.pop()
        is_token = (
            not node.children
            or node.type in ('string_literal', 'string', 'character_literal')
        )
        if is_token and node.type != 'comment':
            spans.append((node.start_point, node.end_point))
        else:
            # Reversed so that the leftmost child is popped (visited) first.
            pending.extend(reversed(node.children))
    return spans
|
|
|
def tree_to_variable_index(root_node, index_to_code):
    """Collect the spans of tokens whose node type differs from their text.

    Tokens are selected as follows: a node with no children, or of type
    ``'string_literal'``, ``'string'`` or ``'character_literal'``, is a token
    unless its type is ``'comment'``.  For each token the text recorded in
    ``index_to_code`` is compared with the node's type; the span is kept only
    when the two differ.

    Args:
        root_node: Tree node exposing ``children``, ``type``,
            ``start_point`` and ``end_point`` (presumably a tree-sitter
            node -- confirm against the caller).
        index_to_code: Mapping from ``(start_point, end_point)`` to a
            two-item value whose second item is the token's text.

    Returns:
        A list of ``(start_point, end_point)`` pairs in left-to-right,
        depth-first order.

    Raises:
        KeyError: If a token's span is missing from ``index_to_code``.
    """
    spans = []
    # Explicit stack instead of recursion so that very deep trees cannot
    # exhaust the interpreter's recursion limit.
    pending = [root_node]
    while pending:
        node = pending.pop()
        is_token = (
            not node.children
            or node.type in ('string_literal', 'string', 'character_literal')
        )
        if is_token and node.type != 'comment':
            span = (node.start_point, node.end_point)
            _, code = index_to_code[span]
            # Tokens whose type equals their own text are skipped.
            if node.type != code:
                spans.append(span)
        else:
            # Reversed so that the leftmost child is popped (visited) first.
            pending.extend(reversed(node.children))
    return spans
|
|
|
def index_to_code_token(index, code):
    """Return the text that the span ``index`` covers in ``code``.

    Args:
        index: Pair ``(start_point, end_point)``; each point is indexed as
            ``(row, column)``.
        code: Sequence of source lines, indexed by row.

    Returns:
        The covered text.  For a span crossing several rows the pieces are
        concatenated directly, with no separator inserted between rows.

    Raises:
        IndexError: If a row referenced by the span is not in ``code``.
    """
    start_point, end_point = index[0], index[1]
    start_row, start_col = start_point[0], start_point[1]
    end_row, end_col = end_point[0], end_point[1]
    if start_row == end_row:
        return code[start_row][start_col:end_col]
    # Tail of the first row, every full row in between, head of the last row.
    parts = [code[start_row][start_col:]]
    parts.extend(code[row] for row in range(start_row + 1, end_row))
    parts.append(code[end_row][:end_col])
    return "".join(parts)
|
|