diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7c163e2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,15 @@ + +[run] +branch = True +source = sourced/ml/core + +[report] +exclude_lines = + no cover + raise NotImplementedError + if __name__ == "__main__": +ignore_errors = True +omit = + sourced/ml/core/tests/* + sourced/ml/core/swivel.py + sourced/ml/core/bigartm.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..d6f48de --- /dev/null +++ b/.flake8 @@ -0,0 +1,17 @@ +[flake8] +ignore=B008,E121,E123,E126,E203,E226,E24,E704,W503,W504,D100,D105,D200,D301,D402 +max-line-length=99 +exclude= + .git + doc +inline-quotes=" +import-order-style=appnexus +application-package-names=sourced.ml.core +per-file-ignores= + **/tests/**:D + # Should be resolved one by one + # Related issue: https://github.com/src-d/ml/issues/354 + ./sourced/ml/core/extractors/*:D + ./sourced/ml/core/models/**:D + ./sourced/ml/core/algorithms/**:D + ./sourced/ml/core/utils/*:D \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..14f6144 --- /dev/null +++ b/.gitignore @@ -0,0 +1,116 @@ + +#Mac OS +*.DS_Store + +#PyCharm IDE +.idea/ + +# Documentation build files +doc/_build/ +doc/ast2vec.rst +doc/modules.rst + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# CI +.ci \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..731e424 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +jobs=0 +load-plugins=pylint.extensions.docparams + +[MESSAGES CONTROL] +disable=all +enable=missing-param-doc, + differing-param-doc, + differing-type-doc, + missing-return-doc diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5a480e2 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,52 @@ +language: python +sudo: true +dist: xenial +services: + - docker +cache: pip +before_cache: + - chown -R travis:travis $HOME/.cache/pip +stages: + - style + - test +_install: &_install + - travis_retry make bblfsh-start + - pip install --upgrade pip cython codecov + - ML_CORE_SETUP_INCLUDE_TESTS=1 pip install .[tf] + - cd $(pip show sourced.ml.core|grep Location|cut -d' ' -f2)/sourced/ml/core + - find . 
-wholename "*/tests/*" -type d -exec chmod 555 {} \; +_coverage: &_coverage + - coverage run --concurrency=multiprocessing -m unittest discover + - travis_retry coverage combine +matrix: + fast_finish: true + include: + - stage: style + python: 3.7 + script: + - make check + install: + - pip install -r requirements-lint.txt + - stage: test + python: 3.5 + script: *_coverage + install: *_install + - stage: test + python: 3.6 + script: *_coverage + install: *_install + - stage: test + python: 3.7 + script: *_coverage + install: *_install + after_success: + - codecov + - stage: test + name: Tests inside docker + script: + - make docker-build VERSION=test + - make docker-test VERSION=test + install: + - travis_retry make bblfsh-start +notifications: + email: false \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..26b9ba1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:18.04 + +ENV BROWSER=/browser \ + LC_ALL=en_US.UTF-8 + +COPY requirements.txt ml_core/requirements.txt + +RUN apt-get update && \ + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates locales libxml2 libxml2-dev gcc g++ wget \ + python3 python3-dev python3-distutils && \ + echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \ + locale-gen && \ + wget -O - https://bootstrap.pypa.io/get-pip.py | python3 && \ + cd ml_core && \ + pip3 install --no-cache-dir -r requirements.txt && \ + apt-get remove -y python3-dev libxml2-dev gcc g++ wget && \ + apt-get remove -y .*-doc .*-man >/dev/null && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + echo '#!/bin/bash\n\ +\n\ +echo\n\ +echo " $@"\n\ +echo\n\' > /browser && \ + chmod +x /browser + +COPY . ml_core/ +RUN cd ml_core && pip3 install -e . 
diff --git a/license.md b/LICENSE.md similarity index 100% rename from license.md rename to LICENSE.md diff --git a/maintainers.md b/MAINTAINERS.md similarity index 100% rename from maintainers.md rename to MAINTAINERS.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ab7b419 --- /dev/null +++ b/Makefile @@ -0,0 +1,38 @@ +current_dir = $(shell pwd) + +PROJECT = ml_core + +DOCKERFILES = Dockerfile:$(PROJECT) +DOCKER_ORG = "srcd" + +# Including ci Makefile +CI_REPOSITORY ?= https://github.com/src-d/ci.git +CI_BRANCH ?= v1 +CI_PATH ?= .ci +MAKEFILE := $(CI_PATH)/Makefile.main +$(MAKEFILE): + git clone --quiet --depth 1 -b $(CI_BRANCH) $(CI_REPOSITORY) $(CI_PATH); +-include $(MAKEFILE) + +.PHONY: check +check: + ! (grep -R /tmp sourced/ml/core/tests) + flake8 --count + pylint sourced + +.PHONY: test +test: + python3 -m unittest discover + +.PHONY: docker-test +docker-test: + docker ps | grep bblfshd # bblfsh server should be run. Try `make bblfsh-start` command. + docker run --rm -it --network host --entrypoint python3 -w /ml_core \ + -e SKIP_BBLFSH_UTILS_TESTS=1 \ + srcd/ml_core:$(VERSION) -m unittest discover + +.PHONY: bblfsh-start +bblfsh-start: + ! 
docker ps | grep bblfshd # bblfsh server should not be running already + docker run -d --name ml_core_bblfshd --privileged -p 9432\:9432 bblfsh/bblfshd\:v2.12.1 + docker exec -it ml_core_bblfshd bblfshctl driver install python bblfsh/python-driver\:v2.9.0 diff --git a/README.md b/README.md index a924da2..579c5c6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,23 @@ # MLonCode Core Library - [![Build Status](https://travis-ci.org/src-d/ml-core.svg)](https://travis-ci.org/src-d/ml-core) - [![codecov](https://codecov.io/github/src-d/ml-core/coverage.svg)](https://codecov.io/gh/src-d/ml-core) - [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) + [![Travis build status](https://travis-ci.com/src-d/ml-core.svg)](https://travis-ci.com/src-d/ml-core) + [![Code coverage](https://codecov.io/github/src-d/ml-core/coverage.svg)](https://codecov.io/github/src-d/ml-core) + [![Read the Docs](https://img.shields.io/readthedocs/ml-core.svg)](https://readthedocs.org/projects/ml-core/) +[![Docker build status](https://img.shields.io/docker/build/srcd/ml-core.svg)](https://hub.docker.com/r/srcd/ml-core) +[![PyPI package status](https://img.shields.io/pypi/v/sourced-ml-core.svg)](https://pypi.python.org/pypi/sourced-ml-core) +![stability: alpha](https://svg-badge.appspot.com/badge/stability/alpha?color=f47142) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -Library for machine learning on source code. Provides commonly used algorithms and tools - to process the code-related data, such as: Babelfish's UASTs, plain code text, etc. \ No newline at end of file +This library is the foundation for [MLonCode](https://github.com/src-d/awesome-machine-learning-on-source-code) research and development. +It contains commonly used algorithms and tools to process the code-related data, such as [Babelfish's UASTs](https://docs.sourced.tech/babelfish), plain code text and others. 
+ +## Contributions + +...are welcome! See [CONTRIBUTING.md](docs/CONTRIBUTING.md) and [CODE\_OF\_CONDUCT.md](docs/CODE_OF_CONDUCT.md). + +## License + +[Apache 2.0](LICENSE.md) + +## Glossary + +See [here](docs/GLOSSARY.md). \ No newline at end of file diff --git a/SUMMARY.md b/SUMMARY.md deleted file mode 100644 index 9824e7f..0000000 --- a/SUMMARY.md +++ /dev/null @@ -1,16 +0,0 @@ -# Table of contents - -* [README](README.md) -* [doc](doc/README.md) - * [neural\_splitter\_arch](doc/neural_splitter_arch.md) - * [topic\_modeling](doc/topic_modeling.md) - * [cmd](doc/cmd/README.md) - * [Preprocrepos command](doc/cmd/preprocrepos.md) - * [README](doc/proposals/README.md) - * [MLIP-000](doc/proposals/mlip-000.md) - * [spark](doc/spark.md) -* [LICENSE](license.md) -* [MAINTAINERS](maintainers.md) -* [CODE\_OF\_CONDUCT](code_of_conduct.md) -* [CONTRIBUTING](contributing.md) - diff --git a/code_of_conduct.md b/docs/CODE_OF_CONDUCT.md similarity index 100% rename from code_of_conduct.md rename to docs/CODE_OF_CONDUCT.md diff --git a/contributing.md b/docs/CONTRIBUTING.md similarity index 93% rename from contributing.md rename to docs/CONTRIBUTING.md index 458c5b9..6df2b3c 100644 --- a/contributing.md +++ b/docs/CONTRIBUTING.md @@ -1,6 +1,6 @@ # CONTRIBUTING -ml-core project is [Apache licensed](license.md) and accepts contributions via GitHub pull +ml-core project is [Apache licensed](../LICENSE.md) and accepts contributions via GitHub pull requests. This document outlines some of the conventions on development workflow, commit message formatting, contact points, and other resources to make it easier to get your contribution accepted. 
## Certificate of Origin @@ -15,7 +15,7 @@ This can be done easily using the [`-s`](https://github.com/git/git/blob/b2c150d The official support channels, for both users and contributors, are: -* GitHub [issues](https://github.com/src-d/ml/issues)\* +* GitHub [issues](https://github.com/src-d/ml-core/issues)\* * Slack: \#machine-learning room in the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM) \*Before opening a new issue or submitting a new pull request, it's helpful to search the project - it's likely that another user has already reported the issue you're facing, or it's a known issue that we're already aware of. @@ -29,7 +29,7 @@ Pull Requests \(PRs\) are the main and exclusive way to contribute to the offici * The code is formatted according to [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/). * If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged. * If the PR is a new feature, it has to come with a suite of unit tests, that tests the new functionality. -* In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](maintainers.md). +* In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](../MAINTAINERS.md). 
### Format of the commit message diff --git a/GLOSSARY.md b/docs/GLOSSARY.md similarity index 100% rename from GLOSSARY.md rename to docs/GLOSSARY.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md new file mode 100644 index 0000000..df93a5c --- /dev/null +++ b/docs/SUMMARY.md @@ -0,0 +1,8 @@ +# Table of contents + +* [README](README.md) +* [LICENSE](LICENSE.md) +* [MAINTAINERS](MAINTAINERS.md) +* [CODE\_OF\_CONDUCT](docs/CODE_OF_CONDUCT.md) +* [CONTRIBUTING](docs/CONTRIBUTING.md) + diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 0000000..e89a272 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,7 @@ +flake8==3.5.0 +flake8-bugbear==18.8.0 +flake8-docstrings==1.3.0 +flake8-import-order==0.18.1 +flake8-quotes==1.0.0 +flake8-per-file-ignores==0.8.1 +pylint==2.3.1 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8fb6db --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +Cython>=0.28,<1.0; python_version == '3.7' +PyStemmer==1.3.0 +bblfsh>=2.12.7,<3.0 +modelforge==0.12.1 +pygments==2.3.1 +keras==2.2.4 +scikit-learn==0.20.3 +tqdm==4.31.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..77bd0e3 --- /dev/null +++ b/setup.py @@ -0,0 +1,72 @@ +from importlib.machinery import SourceFileLoader +import io +import os.path + +from setuptools import find_packages, setup + +sourcedml = SourceFileLoader("sourced-ml-core", "./sourced/ml/core/__init__.py").load_module() + +with io.open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8") as f: + long_description = f.read() + +tf_requires = ["tensorflow>=1.0,<2.0"] +tf_gpu_requires = ["tensorflow-gpu>=1.0,<2.0"] +exclude_packages = ( + ("sourced.ml.core.tests", "sourced.ml.core.tests.source") + if not os.getenv("ML_CORE_SETUP_INCLUDE_TESTS", False) + else () +) + + +setup( + name="sourced-ml-core", + description="Library containing the core algorithms for machine learning on source code. 
" + "Provides API and tools to train and use models based " + "on source code features extracted from Babelfish's UASTs.", + long_description=long_description, + long_description_content_type="text/markdown", + version=sourcedml.__version__, + license="Apache 2.0", + author="source{d}", + author_email="machine-learning@sourced.tech", + url="https://github.com/src-d/ml-core", + download_url="https://github.com/src-d/ml-core", + packages=find_packages(exclude=exclude_packages), + keywords=[ + "machine learning on source code", + "word2vec", + "id2vec", + "github", + "swivel", + "bow", + "bblfsh", + "babelfish", + ], + install_requires=[ + "PyStemmer>=1.3,<2.0", + "bblfsh>=2.12.7,<3.0", + "modelforge>=0.12.1,<0.13", + "pygments>=2.2.0,<3.0", + "keras>=2.0,<3.0", + "scikit-learn>=0.19,<1.0", + "tqdm>=4.20,<5.0", + ], + extras_require={"tf": tf_requires, "tf_gpu": tf_gpu_requires}, + tests_require=["docker>=3.6.0,<4.0"], + package_data={ + "": ["LICENSE.md", "README.md"], + "sourced.ml.core.tests": ["./asdf/*.asdf", "./swivel/*", "identifiers.csv.tar.gz"], + }, + python_requires=">=3.5", + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Topic :: Software Development :: Libraries", + ], +) diff --git a/sourced/__init__.py b/sourced/__init__.py new file mode 100644 index 0000000..acb8973 --- /dev/null +++ b/sourced/__init__.py @@ -0,0 +1 @@ +"""Common namespace for sourced tools.""" diff --git a/sourced/ml/__init__.py b/sourced/ml/__init__.py new file mode 100644 index 0000000..3f47164 --- /dev/null +++ b/sourced/ml/__init__.py @@ -0,0 +1 @@ +"""MLonCode research playground.""" diff --git a/sourced/ml/core/__init__.py b/sourced/ml/core/__init__.py new file mode 100644 index 0000000..83cf02c 
--- /dev/null +++ b/sourced/ml/core/__init__.py @@ -0,0 +1,9 @@ +"""MLonCode research playground.""" +try: + import modelforge.configuration + + modelforge.configuration.refresh() +except ImportError: + pass + +__version__ = "0.8.3"